diff --git a/.gitignore b/.gitignore index 3557e5d..06125d6 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,4 @@ gradle-app.setting /.vs/ node_modules/ +.env diff --git a/GroupFive/__init__.py b/GroupFive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/admin.py b/GroupFive/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/GroupFive/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/GroupFive/apps.py b/GroupFive/apps.py new file mode 100644 index 0000000..8220433 --- /dev/null +++ b/GroupFive/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class OurApplicationConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'GroupFive' diff --git a/GroupFive/dummy_analysis.py b/GroupFive/dummy_analysis.py new file mode 100644 index 0000000..992ba89 --- /dev/null +++ b/GroupFive/dummy_analysis.py @@ -0,0 +1,13 @@ + +def run_dummy(code, language): + + return { + "summary" : "this dummy code is better than yours", + "findings" : [ + { + "severity" : "Minimal", + "description" : "Bad code", + "fix" : "Figure it Out" + } + ] + } \ No newline at end of file diff --git a/GroupFive/migrations/0001_initial.py b/GroupFive/migrations/0001_initial.py new file mode 100644 index 0000000..b8f9b01 --- /dev/null +++ b/GroupFive/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# Generated by Django 5.0.3 on 2026-02-18 09:51 + +import django.db.models.deletion +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='AnalysisTask', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('input_code', models.TextField()), + ('language', models.CharField(max_length=50)), + 
('status', models.CharField(choices=[('QUEUED', 'Queued'), ('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed')], max_length=20)), + ('results', models.JSONField(blank=True, null=True)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + ), + ] diff --git a/GroupFive/migrations/__init__.py b/GroupFive/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/models.py b/GroupFive/models.py new file mode 100644 index 0000000..f6f0ea7 --- /dev/null +++ b/GroupFive/models.py @@ -0,0 +1,21 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +import uuid #for user ID +from django.db import models +from django.contrib.auth.models import User + +class AnalysisTask(models.Model): + #potential review request statuses + STATUS_OPT = [ + ("QUEUED", "Queued"), + ("RUNNING", "Running"), + ("COMPLETED", "Completed"), + ("FAILED", "Failed") + ] + + id = models.UUIDField(primary_key=True, default=uuid.uuid4) #more user id + user = models.ForeignKey(User, on_delete=models.CASCADE) #user id/user + input_code = models.TextField() #user provided code + language = models.CharField(max_length=50) #language of user provided code + status = models.CharField(max_length=20, choices=STATUS_OPT) #status of review request + results = models.JSONField(null=True, blank=True) #results of review + created_at = models.DateTimeField(auto_now_add=True) #creation timestamp diff --git a/GroupFive/serializers.py b/GroupFive/serializers.py new file mode 100644 index 0000000..255c15e --- /dev/null +++ b/GroupFive/serializers.py @@ -0,0 +1,7 @@ +#this file uses serializers to define what information we add to our AnalysisTask model from user +from rest_framework import serializers + +class AnalysisRequestSerializer(serializers.Serializer): + code = serializers.CharField() #for input code 
+    #language definition of input code, can be commented out if language distinction added later
+    language = serializers.CharField()
diff --git a/GroupFive/tasks.py b/GroupFive/tasks.py
new file mode 100644
index 0000000..24c186e
--- /dev/null
+++ b/GroupFive/tasks.py
@@ -0,0 +1,22 @@
+#from celery import shared_task #task queue to handle simultaneous requests, making testing annoying for now can readd later when necessary
+from GroupFive.models import AnalysisTask
+from .dummy_analysis import run_dummy
+
+#@shared_task --from celery, readd later
+def run_analysis_async(task_id):
+
+    #instance of analysisTask
+    task = AnalysisTask.objects.get(id=task_id)
+    task.status = "RUNNING" #update status
+    task.save() #save instance task
+
+    try:
+        #call ai api rather than dummy
+        results = run_dummy(task.input_code, task.language)
+
+        task.results = results #store results
+        task.status = "COMPLETED" #update status
+    except Exception: #NOTE(review): was "except Exception(BaseException) as e:" - an exception *instance* in an except clause raises TypeError the moment anything is caught
+        task.status = "FAILED"
+
+    task.save()
\ No newline at end of file
diff --git a/GroupFive/tests.py b/GroupFive/tests.py
new file mode 100644
index 0000000..5f5da37
--- /dev/null
+++ b/GroupFive/tests.py
@@ -0,0 +1,60 @@
+from rest_framework.test import APITestCase
+from django.contrib.auth.models import User
+from rest_framework import status
+from .models import *
+from uuid import uuid4
+
+
+class InitialAnalysisTests(APITestCase):
+
+    def setUp(self):
+        #create user
+        self.User = User.objects.create_user(
+            username="username",
+            password="password"
+        )
+        self.client.login(username="username", password="password")
+
+    def test_create_analysisTask(self):
+
+        response = self.client.post("/api/GroupFive/",{
+            "code" : "print('Hello World')", #code to analyze
+            "language" : "Python" #language of code
+        }, format="json")
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+        self.assertIn("task_id", response.data) #task_id in data
+        self.assertEqual(response.data["status"], "QUEUED")
+
+class 
InitialWorkflowTest(APITestCase): + + def setUp(self): + #create user + self.User = User.objects.create_user( + username="username", + password="password" + ) + self.client.login(username="username", password="password") + + def test_initial_workflow(self): + response = self.client.post("/api/GroupFive/",{ + "code" : "print('Hello Again')", #code to analyze + "language" : "Python" #language of code + }, format="json") + + self.assertEqual(response.status_code, status.HTTP_200_OK) + + task_id = response.data["task_id"] + + task = AnalysisTask.objects.get(id=task_id) + + #confirm that dummy ran + self.assertEqual(task.status, "COMPLETED") + + result_response = self.client.get(f"/api/GroupFive/{task_id}") + + #ensure task endpoint + self.assertEqual(result_response.status_code, 200) + + + diff --git a/GroupFive/views.py b/GroupFive/views.py new file mode 100644 index 0000000..01cfa8e --- /dev/null +++ b/GroupFive/views.py @@ -0,0 +1,42 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +from rest_framework.views import APIView +from rest_framework.response import Response +from rest_framework.permissions import IsAuthenticated #for user id +from GroupFive.models import AnalysisTask +from GroupFive.serializers import AnalysisRequestSerializer +from .tasks import run_analysis_async + + +#analysis task endpoint +class AnalysisView(APIView): + permission_classes = [IsAuthenticated] + + def post(self, request): + serializer = AnalysisRequestSerializer(data=request.data) + serializer.is_valid(raise_exception=True) #deserialize, check correct input and format, raises 400 Bad Request on fail + + task = AnalysisTask.objects.create( + user=request.user, #user + input_code=serializer.validated_data["code"], + language=serializer.validated_data["language"], + status="QUEUED" + ) + + run_analysis_async(str(task.id)) + + return Response({ + "task_id": str(task.id), + "status": task.status + }) + +#status endpoint +class 
StatusView(APIView): + permission_classes = [IsAuthenticated] + + def get(self, request, task_id): + task = AnalysisTask.objects.get(id=task_id, user=request.user) #user + + return Response({ + "status": task.status, + "summary": task.results if task.status == "COMPLETED" else None + }) \ No newline at end of file diff --git a/ai_payload.json b/ai_payload.json new file mode 100644 index 0000000..92b3004 --- /dev/null +++ b/ai_payload.json @@ -0,0 +1,103 @@ +{ + "schema": "ai_payload_v1", + "project_id": "121d4d1ff944c1642e8901fe9689b26811561e8437fe55d499cf4a9708c67e7d", + "input_path": "/Users/zhangtingen/Downloads/V/testquery.py", + "input_type": "file", + "pipeline_version": "1.0.0", + "normalized_findings": [ + { + "finding_id": "F-001", + "issue_type": "database_access_heuristic", + "title": "Database Access Pattern Detected", + "severity": "low", + "confidence": "low", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 0, + "category": "database_access", + "snippet": "heuristic" + } + ], + "analysis_limitations": [ + "Heuristic only; line may be 0 when matched on whole file." 
+ ] + }, + { + "finding_id": "F-002", + "issue_type": "sql_execution_review", + "title": "SQL Execution Present — Review for Injection / Unsafe Queries", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 7, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT NOW();\"))" + }, + { + "line": 13, + "category": "sql_execution", + "snippet": "#conn.execute(text(\"INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');\"))" + }, + { + "line": 14, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT * FROM users;\"))" + } + ] + }, + { + "finding_id": "F-003", + "issue_type": "potential_sensitive_data_exposure_via_debug_output", + "title": "Potential Sensitive Data Exposure via Debug Output", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 8, + "category": "debug_output", + "snippet": "print(\"Connected! Server time:\", result.fetchone()[0])" + }, + { + "line": 10, + "category": "debug_output", + "snippet": "print(\"Connection failed:\", e)" + }, + { + "line": 15, + "category": "debug_output", + "snippet": "print(result.fetchall())" + } + ], + "analysis_limitations": [ + "Cannot determine if printed data contains sensitive fields without data flow analysis." + ] + }, + { + "finding_id": "F-004", + "issue_type": "broad_exception_handling", + "title": "Broad Exception Handling", + "severity": "low", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 9, + "category": "broad_except", + "snippet": "except Exception as e:" + } + ] + } + ], + "meta": { + "finding_count": 4, + "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed." 
+ } +} \ No newline at end of file diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/config/asgi.py b/config/asgi.py new file mode 100644 index 0000000..39149a0 --- /dev/null +++ b/config/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for config project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_asgi_application() diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000..4a3bd85 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,125 @@ +""" +Django settings for config project. + +Generated by 'django-admin startproject' using Django 5.0.3. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.0/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-y+j3zht6sr%!!2fg0&-ek^21&)yc+y+5a*-ly+@16$8$px)a$@' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'GroupFive', + 'rest_framework' +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'config.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'config.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/5.0/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'db.sqlite3', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/5.0/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = 
True
+
+USE_TZ = True
+
+
+# Static files (CSS, JavaScript, Images)
+# https://docs.djangoproject.com/en/5.0/howto/static-files/
+
+STATIC_URL = 'static/'
+
+# Default primary key field type
+# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field
+
+DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
diff --git a/config/urls.py b/config/urls.py
new file mode 100644
index 0000000..a770eb4
--- /dev/null
+++ b/config/urls.py
@@ -0,0 +1,26 @@
+"""
+URL configuration for config project.
+
+The `urlpatterns` list routes URLs to views. For more information please see:
+    https://docs.djangoproject.com/en/5.0/topics/http/urls/
+Examples:
+Function views
+    1. Add an import: from my_app import views
+    2. Add a URL to urlpatterns: path('', views.home, name='home')
+Class-based views
+    1. Add an import: from other_app.views import Home
+    2. Add a URL to urlpatterns: path('', Home.as_view(), name='home')
+Including another URLconf
+    1. Import the include() function: from django.urls import include, path
+    2. Add a URL to urlpatterns: path('blog/', include('blog.urls'))
+"""
+from django.contrib import admin
+from django.urls import path
+
+from GroupFive.views import AnalysisView, StatusView
+
+urlpatterns = [
+    path('admin/', admin.site.urls),
+    path('api/GroupFive/', AnalysisView.as_view(), name='GroupFive'),
+    path('api/GroupFive/<uuid:task_id>', StatusView.as_view(), name='GroupFive-status'),
+]
diff --git a/config/wsgi.py b/config/wsgi.py
new file mode 100644
index 0000000..c0a9631
--- /dev/null
+++ b/config/wsgi.py
@@ -0,0 +1,16 @@
+"""
+WSGI config for config project.
+
+It exposes the WSGI callable as a module-level variable named ``application``.
+ +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_wsgi_application() diff --git a/db.py b/db.py new file mode 100644 index 0000000..5afc09b --- /dev/null +++ b/db.py @@ -0,0 +1,21 @@ +import os +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker +from dotenv import load_dotenv + +load_dotenv() + +DB_URL = f"postgresql://" \ + f"{os.getenv('DB_USER')}:" \ + f"{os.getenv('DB_PASS')}@" \ + f"{os.getenv('DB_HOST')}:" \ + f"{os.getenv('DB_PORT')}/" \ + f"{os.getenv('DB_NAME')}" + +engine = create_engine( + DB_URL, + echo=True, + connect_args={"sslmode": "require"} +) + +SessionLocal = sessionmaker(bind=engine) \ No newline at end of file diff --git a/front-end/db_queries.html b/front-end/db_queries.html new file mode 100644 index 0000000..f840170 --- /dev/null +++ b/front-end/db_queries.html @@ -0,0 +1,32 @@ + + + + + + + + + Hello World! + + + + + + + + + + + + +
+
+ + + + + + + + + \ No newline at end of file diff --git a/front-end/index.html b/front-end/index.html index 3a3ade3..f824cb1 100644 --- a/front-end/index.html +++ b/front-end/index.html @@ -1,25 +1,116 @@ - - + AutoPen Dashboard + + + + Hello World! - - + + - - - - - - + + +
+
+

Penetration Testing Dashboard

+
System Status: Active
+
+ +
+
+

Total Scans

+

128

+
+
+

Critical Vulnerabilities

+

12

+
+
+

Medium Vulnerabilities

+

34

+
+
+

Low Vulnerabilities

+

56

+
+
- \ No newline at end of file + +
+
+

Upload Code for Analysis

+ +
+ + +
+ + +
+ +
+ + +
+ +
+ + +
+
+ + + +
+

Recent Scan Results

+ + + + + + + + + + + + + + + + + + + + + + + +
TargetDateRisk LevelStatus
example.com02/14/2026CriticalCompleted
test-server.net02/12/2026LowCompleted
+
+
+ + + diff --git a/front-end/login.html b/front-end/login.html new file mode 100644 index 0000000..425f5a7 --- /dev/null +++ b/front-end/login.html @@ -0,0 +1,30 @@ + + + + Login + + + +

Login

+ +{% if error %} +

{{ error }}

+{% endif %} + +
+ {% csrf_token %} + + +

+ + +

+ + +
+ +

Don't have an account?

+Register + + + \ No newline at end of file diff --git a/front-end/register.html b/front-end/register.html new file mode 100644 index 0000000..18bf32c --- /dev/null +++ b/front-end/register.html @@ -0,0 +1,27 @@ + + + + Register + + + +

Create Account

+ +
+ {% csrf_token %} + + +

+ + +

+ + + +
+ +
+Back to Login + + + \ No newline at end of file diff --git a/front-end/scripts/main.js b/front-end/scripts/main.js index feeebf0..fc76eff 100644 --- a/front-end/scripts/main.js +++ b/front-end/scripts/main.js @@ -6,3 +6,26 @@ function operate(operator) { document.querySelector('#output').innerText = result; }); } + +function loadUsers() { + eel.showUsers()(users => { + document.querySelector('#output').innerText = JSON.stringify(users, null, 2); + }); +} + +function addUsers() { + var email = document.querySelector('#email').value; + var password = document.querySelector('#pass').value; + + eel.addUsers(email, password)(response => { + document.querySelector('#output').innerText = response; + }); +function askGPT() { + const prompt = document.querySelector('#prompt').value; + + document.querySelector('#output').innerText = "Loading..."; + + let newVar = eel.ask_api(prompt)(result => { + document.querySelector('#output').innerText = result; + });}} + diff --git a/front-end/scripts/upload.js b/front-end/scripts/upload.js new file mode 100644 index 0000000..4d33a0b --- /dev/null +++ b/front-end/scripts/upload.js @@ -0,0 +1,14 @@ +const tabButtons = document.querySelectorAll(".tab-btn"); +const tabContents = document.querySelectorAll(".tab-content"); + +tabButtons.forEach(button => { + button.addEventListener("click", () => { + // Remove active state + tabButtons.forEach(btn => btn.classList.remove("active")); + tabContents.forEach(tab => tab.classList.remove("active")); + + // Activate selected tab + button.classList.add("active"); + document.getElementById(button.dataset.tab).classList.add("active"); + }); +}); \ No newline at end of file diff --git a/front-end/styles/style.css b/front-end/styles/style.css index e69de29..8357d12 100644 --- a/front-end/styles/style.css +++ b/front-end/styles/style.css @@ -0,0 +1,245 @@ +/* ================= RESET ================= */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; + font-family: 'Segoe UI', Tahoma, Geneva, 
Verdana, sans-serif; +} + +body { + background: #000000; + color: #e5e7eb; + display: flex; + min-height: 100vh; +} + +/* ================= SIDEBAR ================= */ +.sidebar { + width: 250px; + height: 100vh; + background: rgba(17, 24, 39, 0.8); + padding: 20px; + border-right: 1px solid rgba(168, 85, 247, 0.2); +} + +.sidebar h2 { + color: #e9d5ff; + margin-bottom: 40px; + text-align: center; +} + +.sidebar ul { + list-style: none; +} + +.sidebar ul li { + padding: 15px; + margin: 10px 0; + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + cursor: pointer; + transition: 0.25s; +} + +.sidebar ul li:hover { + background: rgba(17, 24, 39, 0.95); +} + +/* ================= MAIN ================= */ +.main { + flex: 1; + padding: 40px; +} + +/* ================= HEADER ================= */ +.header { + display: flex; + justify-content: space-between; + align-items: center; + margin-bottom: 40px; +} + +.header h1 { + color: #ffffff; +} + +.status { + background: rgba(17, 24, 39, 0.8); + padding: 10px 20px; + border-radius: 999px; + border: 1px solid rgba(168, 85, 247, 0.2); +} + +/* ================= UPLOAD SECTION ================= */ +.code-center { + margin-bottom: 50px; +} + +.code-box { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 12px; + padding: 30px; + width: 100%; + max-width: 900px; +} + +.code-box h2 { + color: #ffffff; + margin-bottom: 20px; +} + +/* ================= TABS ================= */ +.upload-tabs { + display: flex; + gap: 12px; + margin-bottom: 20px; +} + +.tab-btn { + background: transparent; + border: 1px solid rgba(168, 85, 247, 0.2); + color: #e9d5ff; + padding: 8px 22px; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.tab-btn:hover { + background: rgba(168, 85, 247, 0.15); +} + +.tab-btn.active { + background: rgba(168, 85, 247, 0.25); +} + +/* ================= TAB CONTENT ================= */ +.tab-content 
{ + display: none; +} + +.tab-content.active { + display: block; +} + +/* ================= DROP ZONE ================= */ +.drop-zone { + display: block; + padding: 50px; + border: 2px dashed rgba(168, 85, 247, 0.8); + border-radius: 12px; + background: rgba(17, 24, 39, 0.6); + text-align: center; + cursor: pointer; + transition: 0.25s; + width: 80%; + max-width: 600px; + margin: 0 auto; +} + +.drop-zone:hover { + background: rgba(17, 24, 39, 0.9); +} + +.drop-zone p { + font-size: 18px; + margin-bottom: 6px; +} + +.drop-zone span { + font-size: 14px; + color: #c4b5fd; +} + +/* ================= TEXTAREA ================= */ +textarea { + width: 100%; + min-height: 220px; + background: rgba(17, 24, 39, 0.6); + color: #ffffff; + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + padding: 15px; + resize: vertical; +} + +/* ================= BUTTON ================= */ +.scan-btn { + margin-top: 20px; + padding: 10px 30px; + background: rgba(168, 85, 247, 0.85); + color: white; + border: none; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.scan-btn:hover { + background: rgba(168, 85, 247, 1); +} + +/* ================= CARDS ================= */ +.cards { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 20px; + margin-bottom: 40px; +} + +.card { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; + transition: 0.25s; +} + +.card:hover { + background: rgba(17, 24, 39, 0.95); +} + +.card h3 { + color: #ffffff; + margin-bottom: 10px; +} + +.card p { + font-size: 28px; + font-weight: bold; +} + +/* ================= TABLE ================= */ +.recent-scans { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; +} + +table { + width: 100%; + border-collapse: collapse; + margin-top: 15px; +} + +th, td { + padding: 12px; + text-align: left; +} + +th { + color: 
#e9d5ff; + border-bottom: 1px solid rgba(168, 85, 247, 0.2); +} + +tr:hover { + background: rgba(168, 85, 247, 0.05); +} + +/* ================= SEVERITY COLORS ================= */ +.critical { color: #fb7185; font-weight: bold; } +.medium { color: #facc15; font-weight: bold; } +.low { color: #34d399; font-weight: bold; } diff --git a/main.py b/main.py index cedf85d..1aef0c6 100644 --- a/main.py +++ b/main.py @@ -1,16 +1,72 @@ import eel - +from db import engine +from sqlalchemy import text +import bcrypt +import os +from dotenv import load_dotenv +from openai import OpenAI +#sofia +#jacob +#tim +#Nathan +#Sid eel.init('front-end') +try: + with engine.connect() as conn: + # Run a simple query to test + result = conn.execute(text("SELECT NOW();")) + print("Connected! Server time:", result.fetchone()[0]) +except Exception as e: + print("Connection failed:", e) +load_dotenv() +print("API key loaded:", bool(os.getenv("OPENAI_API_KEY"))) +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +@eel.expose +def ask_api(user_text): + print("ask_api received:", user_text) + + resp = client.chat.completions.create( + model="gpt-4.1-mini", + messages=[{"role": "user", "content": str(user_text)}], + ) + + answer = resp.choices[0].message.content + print("ask_api answer:", answer) + return answer + + + + @eel.expose def add(num1, num2): return int(num1) + int(num2) - @eel.expose def subtract(num1, num2): return int(num1) - int(num2) +@eel.expose +def showUsers(): + with engine.connect() as conn: + result = conn.execute(text("SELECT * FROM users;")) + users = result.fetchall() + return [dict(row._mapping) for row in users] + +@eel.expose +def addUsers(email, password): + #hashing logic here + hashed = bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode() + + with engine.begin() as conn: # auto-commit + conn.execute( + text("INSERT INTO users (email, password_hash) VALUES (:email, :password)"), + {"email": email, "password": hashed} + ) + return "User added successfully" 
+ -eel.start('index.html', size=(1000, 600)) +if __name__ == "__main__": + eel.start('index.html', size=(1000, 600), mode='safari') diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..8e7ac79 --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() diff --git a/preprocess/__init__.py b/preprocess/__init__.py new file mode 100644 index 0000000..3716dcb --- /dev/null +++ b/preprocess/__init__.py @@ -0,0 +1,26 @@ +""" +Backend preprocessing pipeline for secure code review. +Produces structured JSON (project/file/chunk) without frontend. +""" + +from preprocess.pipeline import run_file, run_snippet, PIPELINE_VERSION +from preprocess.normalized_findings import ( + normalize_preprocess_output, + normalize_file_findings, + run_file_with_findings, + export_ai_payload, + run_file_ai_payload, + slim_finding_for_ai, +) + +__all__ = [ + "run_file", + "run_snippet", + "PIPELINE_VERSION", + "normalize_preprocess_output", + "normalize_file_findings", + "run_file_with_findings", + "export_ai_payload", + "run_file_ai_payload", + "slim_finding_for_ai", +] diff --git a/preprocess/chunking.py b/preprocess/chunking.py new file mode 100644 index 0000000..9fd74b7 --- /dev/null +++ b/preprocess/chunking.py @@ -0,0 +1,119 @@ +""" +Build chunks: whole file, or per function/class, or line-based fallback. 
+""" + +from typing import Any, Dict, List + +MAX_LINES_WHOLE_CHUNK = 200 +LINE_CHUNK_SIZE = 120 +LINE_CHUNK_OVERLAP = 10 + + +def lines_to_content(lines: List[str], start: int, end: int) -> str: + """start/end are 1-based inclusive line numbers.""" + if start < 1: + start = 1 + if end > len(lines): + end = len(lines) + return "\n".join(lines[start - 1 : end]) + + +def build_chunks( + file_id: str, + content: str, + language: str, + structure: Dict[str, Any], +) -> List[Dict[str, Any]]: + """ + Returns list of chunk dicts with chunk_id, file_id, start_line, end_line, type, symbol, content. + chunk_id filled by caller after content_hash. + """ + lines = content.splitlines() + n = len(lines) + chunks: List[Dict[str, Any]] = [] + + if language == "python" and structure.get("parse_ok"): + # Prefer function/class spans + spans = [] + for f in structure.get("functions", []): + spans.append(("function", f["name"], f["line"], f.get("end_line", f["line"]))) + for c in structure.get("classes", []): + spans.append(("class", c["name"], c["line"], c.get("end_line", c["line"]))) + spans.sort(key=lambda x: x[2]) + if spans and n > MAX_LINES_WHOLE_CHUNK: + covered = set() + for typ, sym, start, end in spans: + if end < start: + end = start + chunk_content = lines_to_content(lines, start, end) + if not chunk_content.strip(): + continue + chunks.append({ + "file_id": file_id, + "start_line": start, + "end_line": end, + "type": typ, + "symbol": sym, + "content": chunk_content, + }) + for ln in range(start, end + 1): + covered.add(ln) + # Optional: add line-based for uncovered regions — keep simple: if no spans cover whole file, fallback + if not chunks: + pass + if not chunks and n <= MAX_LINES_WHOLE_CHUNK: + chunks.append({ + "file_id": file_id, + "start_line": 1, + "end_line": n, + "type": "file", + "symbol": None, + "content": content, + }) + elif not chunks: + chunks.extend(_line_chunks(file_id, lines)) + else: + if n <= MAX_LINES_WHOLE_CHUNK: + chunks.append({ + "file_id": 
file_id, + "start_line": 1, + "end_line": n, + "type": "file", + "symbol": None, + "content": content, + }) + else: + chunks.extend(_line_chunks(file_id, lines)) + + if not chunks and content.strip(): + chunks.append({ + "file_id": file_id, + "start_line": 1, + "end_line": max(n, 1), + "type": "file", + "symbol": None, + "content": content, + }) + return chunks + + +def _line_chunks(file_id: str, lines: List[str]) -> List[Dict[str, Any]]: + out = [] + n = len(lines) + i = 0 + while i < n: + start = i + 1 + end = min(i + LINE_CHUNK_SIZE, n) + block = "\n".join(lines[i:end]) + out.append({ + "file_id": file_id, + "start_line": start, + "end_line": end, + "type": "lines", + "symbol": None, + "content": block, + }) + i = end - LINE_CHUNK_OVERLAP if end - LINE_CHUNK_OVERLAP > i else end + if i >= n: + break + return out diff --git a/preprocess/filters.py b/preprocess/filters.py new file mode 100644 index 0000000..d828632 --- /dev/null +++ b/preprocess/filters.py @@ -0,0 +1,68 @@ +""" +Skip binary/large/minified paths and irrelevant directories. +""" + +from pathlib import Path +from typing import Optional, Tuple + +SKIP_DIR_NAMES = { + "node_modules", + "venv", + ".venv", + ".git", + "dist", + "build", + "__pycache__", + ".next", + "target", + ".idea", + ".vscode", +} + +BINARY_EXTENSIONS = { + ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp", + ".pdf", ".zip", ".tar", ".gz", ".7z", ".rar", + ".so", ".dll", ".dylib", ".exe", ".bin", + ".pyc", ".pyo", ".class", ".o", ".a", + ".woff", ".woff2", ".ttf", ".eot", + ".mp3", ".mp4", ".webm", ".avi", +} + +MINIFIED_NAME_SUFFIXES = (".min.js", ".min.css") +GENERATED_NAME_PARTS = ("-generated", "_generated", ".generated.") + + +def should_skip_path(path: Path, max_bytes: int = 512 * 1024) -> Tuple[bool, Optional[str]]: + """ + Returns (skip, reason). reason is None if not skipped. 
+ """ + if not path.exists(): + return True, "not_found" + if path.is_dir(): + if path.name in SKIP_DIR_NAMES: + return True, f"skip_dir:{path.name}" + return False, None + suffix = path.suffix.lower() + if suffix in BINARY_EXTENSIONS: + return True, f"binary_ext:{suffix}" + try: + size = path.stat().st_size + except OSError: + return True, "stat_failed" + if size > max_bytes: + return True, f"too_large:{size}" + name = path.name.lower() + for s in MINIFIED_NAME_SUFFIXES: + if name.endswith(s): + return True, "minified_name" + for part in GENERATED_NAME_PARTS: + if part in name: + return True, "generated_name" + return False, None + + +def path_has_skip_segment(path: Path) -> bool: + for part in path.parts: + if part in SKIP_DIR_NAMES: + return True + return False diff --git a/preprocess/hashing.py b/preprocess/hashing.py new file mode 100644 index 0000000..b7e944c --- /dev/null +++ b/preprocess/hashing.py @@ -0,0 +1,15 @@ +import hashlib +import uuid +from pathlib import Path + + +def sha256_text(s: str) -> str: + return hashlib.sha256(s.encode("utf-8")).hexdigest() + + +def sha256_bytes(b: bytes) -> str: + return hashlib.sha256(b).hexdigest() + + +def new_uuid() -> str: + return str(uuid.uuid4()) diff --git a/preprocess/io_read.py b/preprocess/io_read.py new file mode 100644 index 0000000..6f0862d --- /dev/null +++ b/preprocess/io_read.py @@ -0,0 +1,35 @@ +""" +Read file as text: UTF-8 first, then fallbacks. Detect unreadable/binary. +""" + +from pathlib import Path +from typing import Optional, Tuple + +ENCODING_FALLBACKS = ["utf-8", "utf-8-sig", "latin-1", "cp1252"] + + +def read_text_with_fallback(path: Path) -> Tuple[Optional[str], Optional[str], Optional[str]]: + """ + Returns (text, encoding_used, error). + If unreadable, text is None and error explains why. 
+ """ + raw = path.read_bytes() + if b"\x00" in raw[:8192] and raw[:8192].count(b"\x00") > 2: + return None, None, "likely_binary_null_bytes" + for enc in ENCODING_FALLBACKS: + try: + text = raw.decode(enc) + return text, enc, None + except UnicodeDecodeError: + continue + return None, None, "decode_failed_all_encodings" + + +def is_probably_text(s: str, sample_lines: int = 50) -> bool: + """Heuristic: too many non-printable chars => skip.""" + sample = "\n".join(s.splitlines()[:sample_lines]) + if not sample.strip(): + return True + printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t") + ratio = printable / max(len(sample), 1) + return ratio >= 0.85 diff --git a/preprocess/language.py b/preprocess/language.py new file mode 100644 index 0000000..0610b53 --- /dev/null +++ b/preprocess/language.py @@ -0,0 +1,105 @@ +""" +Language detection: extension first, then shebang/keywords/patterns. +""" + +import re +from pathlib import Path +from typing import Dict, Optional + +EXT_TO_LANGUAGE = { + ".py": "python", + ".js": "javascript", + ".mjs": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".jsx": "javascript", + ".java": "java", + ".go": "go", + ".rb": "ruby", + ".php": "php", + ".cs": "csharp", + ".rs": "rust", + ".c": "c", + ".cpp": "cpp", + ".h": "c", + ".sql": "sql", + ".sh": "shell", + ".bash": "shell", + ".yaml": "yaml", + ".yml": "yaml", + ".json": "json", + ".html": "html", + ".css": "css", +} + +SHEBANG_PATTERN = re.compile(r"^#!\s*/usr/bin/env\s+(\w+)|^#!\s*/.*\b(python|node|ruby)\b", re.MULTILINE) + +KEYWORD_HINTS = [ + (r"\bdef\s+\w+\s*\(", "python"), + (r"\bimport\s+\w+", "python"), + (r"\bfrom\s+\w+\s+import\b", "python"), + (r"\bfunction\s+\w+\s*\(", "javascript"), + (r"\bconst\s+\w+\s*=\s*\(", "javascript"), + (r"\brequire\s*\(", "javascript"), + (r"\bpublic\s+class\s+\w+", "java"), + (r"\bpackage\s+main\b", "go"), + (r"\bfn\s+main\s*\(", "rust"), +] + + +def detect_language(path: Optional[Path], content: str, hint: 
Optional[str] = None) -> Dict: + """ + Returns { "language", "confidence": high|medium|low|unknown, "reason": str }. + """ + reasons = [] + lang_from_ext = None + if path and path.suffix: + lang_from_ext = EXT_TO_LANGUAGE.get(path.suffix.lower()) + if lang_from_ext: + reasons.append(f"extension:{path.suffix}") + + if hint: + h = hint.strip().lower() + if h in ("py", "python"): + lang_from_ext = lang_from_ext or "python" + reasons.append("hint:python") + elif h in ("js", "javascript"): + lang_from_ext = lang_from_ext or "javascript" + reasons.append("hint:javascript") + + lang_from_content = None + first_lines = "\n".join(content.splitlines()[:30]) + m = SHEBANG_PATTERN.search(first_lines) + if m: + g = (m.group(1) or m.group(2) or "").lower() + if "python" in g: + lang_from_content = "python" + elif "node" in g: + lang_from_content = "javascript" + elif "ruby" in g: + lang_from_content = "ruby" + if lang_from_content: + reasons.append("shebang") + + if not lang_from_content: + for pattern, lang in KEYWORD_HINTS: + if re.search(pattern, first_lines): + lang_from_content = lang + reasons.append(f"keyword:{pattern[:20]}") + break + + if lang_from_ext and lang_from_content: + if lang_from_ext == lang_from_content: + return {"language": lang_from_ext, "confidence": "high", "reason": ";".join(reasons)} + return { + "language": lang_from_ext, + "confidence": "medium", + "reason": f"extension_vs_content_conflict;{';'.join(reasons)}", + } + if lang_from_ext: + return {"language": lang_from_ext, "confidence": "medium", "reason": ";".join(reasons)} + if lang_from_content: + return {"language": lang_from_content, "confidence": "medium", "reason": ";".join(reasons)} + if path and path.suffix: + return {"language": "unknown", "confidence": "low", "reason": f"unknown_ext:{path.suffix}"} + return {"language": "unknown", "confidence": "unknown", "reason": "no_extension_no_keywords"} diff --git a/preprocess/normalized_findings.py b/preprocess/normalized_findings.py new file mode 
"""
Normalize preprocess signals into structured findings for the AI / report stage.

No human-review layer here: every finding is emitted with
finding_status == "review_needed"; confidence reflects the heuristic only.
"""

from typing import Any, Dict, List, Optional

# Signal bucket key (as in file_record["signals"]) -> default issue template.
# "category" inside each evidence item is the regex category, e.g.
# broad_except or debug_output.
SIGNAL_BUCKET_META = {
    "error_handling": {
        "issue_type": "broad_exception_handling",
        "title": "Broad Exception Handling",
        "affected_component": ["backend", "error_handling"],
        "confidence": "medium",
        "severity": "low",
        "remediation_keywords": [
            "catch specific exceptions",
            "structured logging",
            "safer exception handling",
        ],
        "possible_impact": [
            "reduced error observability",
            "generic failure handling may hide root causes",
        ],
    },
    "debug_output": {
        "issue_type": "potential_sensitive_data_exposure_via_debug_output",
        "title": "Potential Sensitive Data Exposure via Debug Output",
        "affected_component": ["backend", "logging"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "remove debug prints",
            "sanitize logs",
            "avoid dumping full query results",
        ],
        "possible_impact": [
            "internal data may appear in console or logs",
            "debug output could leak in production",
        ],
        "analysis_limitations": [
            "Cannot determine if printed data contains sensitive fields without data flow analysis.",
        ],
    },
    "sql_execution": {
        "issue_type": "sql_execution_review",
        "title": "SQL Execution Present — Review for Injection / Unsafe Queries",
        "affected_component": ["backend", "database"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "parameterized queries",
            "prepared statements",
            "avoid string concatenation in SQL",
        ],
        "possible_impact": [
            "SQL injection if input is concatenated into queries",
            "unsafe query patterns",
        ],
    },
    "command_execution": {
        "issue_type": "command_execution_surface",
        "title": "Command Execution or Dynamic Evaluation",
        "affected_component": ["backend", "process"],
        "confidence": "high",
        "severity": "high",
        "remediation_keywords": [
            "avoid eval/exec",
            "sanitize subprocess arguments",
            "use allowlists for shell commands",
        ],
        "possible_impact": [
            "command injection",
            "arbitrary code execution",
        ],
    },
    "file_access": {
        "issue_type": "file_io_review",
        "title": "File Access — Review Path Handling",
        "affected_component": ["backend", "filesystem"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "validate paths",
            "avoid path traversal",
        ],
        "possible_impact": [
            "path traversal if paths are user-controlled",
        ],
    },
    "user_input_sources": {
        "issue_type": "user_input_flow",
        "title": "User-Controlled Input Source",
        "affected_component": ["backend", "input"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "validate and sanitize input",
            "use safe APIs",
        ],
        "possible_impact": [
            "injection or logic flaws if input reaches sensitive sinks",
        ],
    },
    "possible_hardcoded_secrets": {
        "issue_type": "possible_hardcoded_secret",
        "title": "Possible Hardcoded Secret",
        "affected_component": ["backend", "secrets"],
        "confidence": "low",
        "severity": "high",
        "remediation_keywords": [
            "use environment variables or secret manager",
            "rotate credentials",
        ],
        "possible_impact": [
            "credential leak if committed or logged",
        ],
        "analysis_limitations": [
            "May be test data or placeholders; verify context.",
        ],
    },
    "auth_related_keywords": {
        "issue_type": "auth_surface_keyword",
        "title": "Auth-Related Keyword Present",
        "affected_component": ["backend", "auth"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "review auth flow",
            "session handling",
        ],
        "possible_impact": [
            "auth logic may need manual review",
        ],
    },
    "database_access": {
        "issue_type": "database_access_heuristic",
        "title": "Database Access Pattern Detected",
        "affected_component": ["backend", "database"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "least privilege",
            "connection pooling security",
        ],
        "possible_impact": [
            "review how connections and queries are used",
        ],
        "analysis_limitations": [
            "Heuristic only; line may be 0 when matched on whole file.",
        ],
    },
    "crypto_usage": {
        "issue_type": "crypto_usage_review",
        "title": "Cryptographic API Usage",
        "affected_component": ["backend", "crypto"],
        "confidence": "low",
        "severity": "medium",
        "remediation_keywords": [
            "use vetted libraries",
            "avoid weak algorithms",
        ],
        "possible_impact": [
            "misuse may weaken security",
        ],
    },
}


def _evidence_from_signal_item(signal_key: str, index: int, item: Dict[str, Any]) -> Dict[str, Any]:
    """
    Convert one raw signal item into an evidence dict.

    *index* is unused today but kept for interface stability.  Match text is
    truncated to 500 chars to bound payload size.
    """
    text = item.get("match", "")
    if len(text) > 500:
        text = text[:500] + "..."
    return {
        "line": item.get("line", 0),
        "signal": signal_key,
        "category": item.get("category", signal_key),
        "text": text,
    }


def _refs_for_bucket(signal_key: str, indices: List[int]) -> List[str]:
    """JSON-pointer-ish references back into file_record['signals']."""
    return [f"signals.{signal_key}[{i}]" for i in indices]


def normalize_file_findings(
    file_record: Dict[str, Any],
    file_index: int = 0,
    starting_f_index: int = 1,
) -> List[Dict[str, Any]]:
    """
    Build normalized findings from a single preprocess file_record.

    One finding per non-empty signal bucket; the finding's evidence list
    contains every dict item in that bucket.  Finding ids are F-001, F-002, …
    starting at *starting_f_index*.
    """
    signals = file_record.get("signals") or {}
    path = file_record.get("path", f"file_{file_index}")
    findings: List[Dict[str, Any]] = []
    f_num = starting_f_index

    for signal_key, items in signals.items():
        if not items:
            continue
        meta = SIGNAL_BUCKET_META.get(signal_key)
        if not meta:
            # Unknown bucket: emit a generic low-confidence finding.
            meta = {
                "issue_type": f"heuristic_{signal_key}",
                "title": f"Signal: {signal_key}",
                "affected_component": ["backend"],
                "confidence": "low",
                "severity": "low",
                "remediation_keywords": ["manual review"],
                "possible_impact": ["pattern matched; context unknown"],
            }

        evidence: List[Dict[str, Any]] = []
        kept_indices: List[int] = []
        for i, item in enumerate(items):
            if not isinstance(item, dict):
                continue
            evidence.append(_evidence_from_signal_item(signal_key, i, item))
            kept_indices.append(i)

        if not evidence:
            continue

        finding = {
            "finding_id": f"F-{f_num:03d}",
            "issue_type": meta["issue_type"],
            "title": meta["title"],
            "affected_component": list(meta["affected_component"]),
            "confidence": meta["confidence"],
            "severity": meta["severity"],
            "finding_status": "review_needed",
            "verification_method": "automated_heuristic",
            "source_file": path,
            "file_id": file_record.get("file_id"),
            "possible_impact": list(meta.get("possible_impact", [])),
            "evidence": evidence,
            "remediation_keywords": list(meta.get("remediation_keywords", [])),
            "source_signal_refs": _refs_for_bucket(signal_key, kept_indices),
        }
        if meta.get("analysis_limitations"):
            finding["analysis_limitations"] = list(meta["analysis_limitations"])
        findings.append(finding)
        f_num += 1

    return findings


def normalize_preprocess_output(preprocess_output: Dict[str, Any]) -> Dict[str, Any]:
    """
    Normalize a full preprocess pipeline output into a findings-only envelope.

    Finding ids are numbered continuously across all files.
    """
    all_findings: List[Dict[str, Any]] = []
    f_num = 1
    for idx, file_record in enumerate(preprocess_output.get("files") or []):
        batch = normalize_file_findings(file_record, file_index=idx, starting_f_index=f_num)
        all_findings.extend(batch)
        f_num += len(batch)

    return {
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "project_id": preprocess_output.get("project_id"),
        "created_at": preprocess_output.get("created_at"),
        "input_type": preprocess_output.get("input_type"),
        "input_path": preprocess_output.get("input_path"),
        "normalized_findings": all_findings,
        "normalized_findings_meta": {
            "count": len(all_findings),
            "generator": "preprocess.normalized_findings",
            "note": "All findings are heuristic; finding_status is review_needed unless overridden downstream.",
        },
    }


def slim_finding_for_ai(
    finding: Dict[str, Any],
    max_evidence_snippet: int = 120,
    include_remediation: bool = False,
) -> Dict[str, Any]:
    """
    Shrink one finding for LLM context: truncate evidence snippets, drop
    remediation lists unless requested, and cap analysis limitations.
    """
    out = {
        "finding_id": finding.get("finding_id"),
        "issue_type": finding.get("issue_type"),
        "title": finding.get("title"),
        "severity": finding.get("severity"),
        "confidence": finding.get("confidence"),
        "finding_status": finding.get("finding_status"),
        "source_file": finding.get("source_file"),
    }
    slim_evidence = []
    for ev in finding.get("evidence") or []:
        text = ev.get("text") or ""
        if len(text) > max_evidence_snippet:
            text = text[:max_evidence_snippet].rstrip() + "..."
        slim_evidence.append({
            "line": ev.get("line"),
            "category": ev.get("category"),
            "snippet": text if text else None,
        })
    out["evidence"] = slim_evidence
    if include_remediation and finding.get("remediation_keywords"):
        out["remediation_keywords"] = finding.get("remediation_keywords")
    if finding.get("analysis_limitations"):
        # Keep at most two, one line each, to save tokens.
        out["analysis_limitations"] = [
            (s[:150] + "...") if len(s) > 150 else s
            for s in finding.get("analysis_limitations", [])[:2]
        ]
    return out


def export_ai_payload(
    preprocess_output: Dict[str, Any],
    max_evidence_snippet: int = 120,
) -> Dict[str, Any]:
    """
    Compact payload for the AI only — no chunk content, no raw signals.
    The full preprocess JSON stays separate; this one fits a token budget.
    """
    normalized = normalize_preprocess_output(preprocess_output)
    slim_findings = [
        slim_finding_for_ai(f, max_evidence_snippet=max_evidence_snippet)
        for f in normalized.get("normalized_findings") or []
    ]
    return {
        "schema": "ai_payload_v1",
        "project_id": preprocess_output.get("project_id"),
        "input_path": preprocess_output.get("input_path"),
        "input_type": preprocess_output.get("input_type"),
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "normalized_findings": slim_findings,
        "meta": {
            "finding_count": len(slim_findings),
            "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed.",
        },
    }


def run_file_ai_payload(path: str, max_evidence_snippet: int = 120) -> Dict[str, Any]:
    """Preprocess *path* then return only the slim AI payload."""
    # Local import avoids a circular dependency with preprocess.pipeline.
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            "schema": "ai_payload_v1",
            "project_id": pre.get("project_id"),
            "input_path": pre.get("input_path"),
            "normalized_findings": [],
            "meta": {"finding_count": 0, "reason": "no files processed"},
        }
    return export_ai_payload(pre, max_evidence_snippet=max_evidence_snippet)


def run_file_with_findings(path: str) -> Dict[str, Any]:
    """Convenience: preprocess *path* then merge normalized findings on top."""
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            **pre,
            "normalized_findings": [],
            "normalized_findings_meta": {"count": 0, "reason": "no files processed"},
        }
    normalized = normalize_preprocess_output(pre)
    # Keep the full preprocess payload plus findings at top level for the AI.
    out = dict(pre)
    out["normalized_findings"] = normalized["normalized_findings"]
    out["normalized_findings_meta"] = normalized["normalized_findings_meta"]
    return out
+""" + +import ast +from typing import Any, Dict, List, Optional + + +def parse_python_structure(source: str) -> Dict[str, Any]: + out: Dict[str, Any] = { + "parse_ok": False, + "imports": [], + "functions": [], + "classes": [], + "calls": [], # simplified: names only, from ast.Call if possible + } + try: + tree = ast.parse(source) + except SyntaxError: + return out + out["parse_ok"] = True + for node in ast.walk(tree): + if isinstance(node, ast.Import): + for alias in node.names: + out["imports"].append({ + "line": node.lineno, + "end_line": getattr(node, "end_lineno", node.lineno), + "name": alias.name, + "alias": alias.asname, + }) + elif isinstance(node, ast.ImportFrom): + mod = node.module or "" + for alias in node.names: + out["imports"].append({ + "line": node.lineno, + "end_line": getattr(node, "end_lineno", node.lineno), + "name": f"{mod}.{alias.name}" if mod else alias.name, + "alias": alias.asname, + }) + elif isinstance(node, ast.FunctionDef) or isinstance(node, ast.AsyncFunctionDef): + end = getattr(node, "end_lineno", node.lineno) + out["functions"].append({ + "name": node.name, + "line": node.lineno, + "end_line": end, + }) + elif isinstance(node, ast.ClassDef): + end = getattr(node, "end_lineno", node.lineno) + out["classes"].append({ + "name": node.name, + "line": node.lineno, + "end_line": end, + }) + elif isinstance(node, ast.Call): + name = _call_name(node) + if name: + out["calls"].append({"line": node.lineno, "name": name}) + return out + + +def _call_name(node: ast.Call) -> Optional[str]: + if isinstance(node.func, ast.Name): + return node.func.id + if isinstance(node.func, ast.Attribute): + parts: List[str] = [] + cur = node.func + while isinstance(cur, ast.Attribute): + parts.append(cur.attr) + cur = cur.value + if isinstance(cur, ast.Name): + parts.append(cur.id) + return ".".join(reversed(parts)) + return None diff --git a/preprocess/pipeline.py b/preprocess/pipeline.py new file mode 100644 index 0000000..dfe9e5f --- /dev/null +++ 
# ============================================================
# preprocess/pipeline.py
# Orchestrates preprocessing: file or snippet -> structured JSON.
# ============================================================

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from preprocess.chunking import build_chunks
from preprocess.filters import should_skip_path, path_has_skip_segment
from preprocess.hashing import sha256_text
from preprocess.io_read import read_text_with_fallback, is_probably_text
from preprocess.language import detect_language
from preprocess.parse_python import parse_python_structure
from preprocess.signals import extract_signals, risk_hints_from_signals

PIPELINE_VERSION = "1.0.0"


def _file_id(project_id: str, rel_path: str, content: str) -> str:
    """Stable file id from project, relative path and content hash."""
    return sha256_text(f"{project_id}:{rel_path}:{sha256_text(content)}")


def _chunk_id(file_id: str, start: int, end: int, content: str) -> str:
    """Stable chunk id from owning file, line span and content hash."""
    return sha256_text(f"{file_id}:{start}:{end}:{sha256_text(content)}")


def _skip_result(project_id: str, path: Path, reason: str) -> Dict[str, Any]:
    """Uniform result envelope for a file that was filtered out before analysis."""
    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "file",
        "input_path": str(path),
        "files": [],
        "files_skipped": [{"path": str(path), "reason": reason}],
        "chunks": [],
    }


def _chunk_records(file_id: str, raw_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach ids, content hashes and per-chunk signals to raw chunk dicts."""
    records: List[Dict[str, Any]] = []
    for ch in raw_chunks:
        chunk_signals = extract_signals(ch["content"])
        records.append({
            "chunk_id": _chunk_id(file_id, ch["start_line"], ch["end_line"], ch["content"]),
            "file_id": file_id,
            "start_line": ch["start_line"],
            "end_line": ch["end_line"],
            "type": ch["type"],
            "symbol": ch["symbol"],
            "content": ch["content"],
            "content_hash": sha256_text(ch["content"]),
            # Drop empty signal buckets to keep the JSON compact.
            "signals": {k: v for k, v in chunk_signals.items() if v},
            "risk_hints": risk_hints_from_signals(chunk_signals),
        })
    return records


def _file_record(
    file_id: str,
    rel_path: str,
    lang_info: Dict[str, Any],
    structure: Dict[str, Any],
    text: str,
    encoding_used: Optional[str],
    byte_size: int,
    signals_full: Dict[str, Any],
) -> Dict[str, Any]:
    """Per-file metadata record: language, structure, signals, hashes."""
    return {
        "file_id": file_id,
        "path": rel_path,
        "language": lang_info["language"],
        "language_confidence": lang_info["confidence"],
        "language_reason": lang_info["reason"],
        "encoding_used": encoding_used,
        "line_count": len(text.splitlines()),
        "byte_size": byte_size,
        "content_hash": sha256_text(text),
        "parse_ok": structure.get("parse_ok", False),
        "imports": structure.get("imports", []),
        "functions": structure.get("functions", []),
        "classes": structure.get("classes", []),
        "calls_sample": structure.get("calls", [])[:50],  # cap to bound payload
        "signals": {k: v for k, v in signals_full.items() if v},
        "risk_hints": risk_hints_from_signals(signals_full),
    }


def run_file(
    path: Union[str, Path],
    language_hint: Optional[str] = None,
    max_file_bytes: int = 512 * 1024,
) -> Dict[str, Any]:
    """
    Preprocess a single file on disk into the structured JSON envelope.

    Applies the skip filters, reads with encoding fallback, detects language,
    parses Python structure, builds chunks and extracts signals.  Filtered or
    unreadable files come back with an empty files list and a skip reason.
    """
    path = Path(path).resolve()
    project_id = sha256_text(str(path))

    skip, reason = should_skip_path(path, max_bytes=max_file_bytes)
    if skip:
        return _skip_result(project_id, path, reason)
    if path_has_skip_segment(path):
        return _skip_result(project_id, path, "path_in_skip_dir")

    text, encoding, err = read_text_with_fallback(path)
    if text is None or err:
        return _skip_result(project_id, path, err or "read_failed")
    if not is_probably_text(text):
        return _skip_result(project_id, path, "not_probably_text")

    rel_path = path.name  # single-file mode: record only the basename
    lang_info = detect_language(path, text, language_hint)
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(text)

    file_id = _file_id(project_id, rel_path, text)
    signals_full = extract_signals(text)
    raw_chunks = build_chunks(file_id, text, lang_info["language"], structure)

    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "file",
        "input_path": str(path),
        "files": [
            _file_record(
                file_id, rel_path, lang_info, structure, text,
                encoding, path.stat().st_size, signals_full,
            )
        ],
        "files_skipped": [],
        "chunks": _chunk_records(file_id, raw_chunks),
    }


def run_snippet(
    content: str,
    virtual_name: str = "snippet.py",
    language_hint: Optional[str] = None,
) -> Dict[str, Any]:
    """
    Preprocess an in-memory snippet as a single virtual file.

    Same output shape as run_file; project_id is derived from the snippet's
    head and virtual name, and encoding is reported as utf-8.
    """
    path = Path(virtual_name)
    project_id = sha256_text(content[:5000] + virtual_name)
    lang_info = detect_language(path, content, language_hint)
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(content)

    file_id = _file_id(project_id, virtual_name, content)
    signals_full = extract_signals(content)
    raw_chunks = build_chunks(file_id, content, lang_info["language"], structure)

    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": "snippet",
        "input_path": virtual_name,
        "files": [
            _file_record(
                file_id, virtual_name, lang_info, structure, content,
                "utf-8", len(content.encode("utf-8")), signals_full,
            )
        ],
        "files_skipped": [],
        "chunks": _chunk_records(file_id, raw_chunks),
    }


def run_file_to_json(path: Union[str, Path], out_path: Optional[str] = None) -> str:
    """Run the file pipeline and return (optionally also write) pretty JSON."""
    data = run_file(path)
    s = json.dumps(data, ensure_ascii=False, indent=2)
    if out_path:
        Path(out_path).write_text(s, encoding="utf-8")
    return s
"""Regex-based security signals and risk_hints derived from source text."""

import re
from typing import Dict, List, Set

# (regex, signal category) pairs applied line-by-line.
PATTERNS = [
    (r"\bsubprocess\.(run|Popen|call)\b", "command_execution"),
    (r"\bos\.system\s*\(", "command_execution"),
    (r"\bexec\s*\(", "command_execution"),
    (r"\beval\s*\(", "command_execution"),
    (r"\.execute\s*\(\s*[\"']", "sql_execution"),
    (r"\bcursor\.execute\s*\(", "sql_execution"),
    # The alternation must be grouped: the previous form
    # r"\btext\s*\(\s*[\"'].*SELECT|INSERT|UPDATE|DELETE" matched ANY line
    # containing a bare INSERT/UPDATE/DELETE, because `|` binds weaker than
    # concatenation in regex.
    (r"\btext\s*\(\s*[\"'].*(?:SELECT|INSERT|UPDATE|DELETE)", "sql_execution"),
    (r"\bopen\s*\(", "file_access"),
    (r"\bPath\s*\([^)]*\)\.(read|write)", "file_access"),
    (r"\binput\s*\(", "user_input_source"),
    (r"\brequest\.(args|form|json|get)\b", "user_input_source"),
    (r"\bargv\b|\bsys\.argv\b", "user_input_source"),
    (r"\bhashlib\.|bcrypt\.|crypto\.|jwt\.|openssl\b", "crypto_usage"),
    (r"\bprint\s*\(", "debug_output"),
    (r"\bconsole\.log\s*\(", "debug_output"),
    (r"\bexcept\s+Exception\b|\bexcept\s*:", "broad_except"),
    (r"(?i)(password|secret|api_key|apikey|token)\s*=\s*['\"][^'\"]{8,}", "possible_hardcoded_secret"),
    (r"(?i)\b(auth|login|session|oauth|bearer)\b", "auth_keyword"),
]

# Compiled once at import time instead of per scanned line.
_COMPILED = [(re.compile(pattern), category) for pattern, category in PATTERNS]

# Whole-text heuristic for database access (case-insensitive).
_DB_HEURISTIC = re.compile(r"\bengine\.connect\b|\bconnection\b|\bdatabase\b", re.I)

# Maps raw pattern categories to the (pluralized) output keys.
_CATEGORY_TO_KEY = {
    "command_execution": "command_execution",
    "sql_execution": "sql_execution",
    "file_access": "file_access",
    "user_input_source": "user_input_sources",
    "crypto_usage": "crypto_usage",
    "debug_output": "debug_output",
    "broad_except": "error_handling",
    "possible_hardcoded_secret": "possible_hardcoded_secrets",
    "auth_keyword": "auth_related_keywords",
}


def extract_signals(content: str) -> Dict[str, List[Dict]]:
    """Scan ``content`` line-by-line for security-relevant signals.

    Returns a dict with a fixed key set (imports, function_calls,
    user_input_sources, database_access, sql_execution, command_execution,
    file_access, crypto_usage, debug_output, error_handling,
    possible_hardcoded_secrets, auth_related_keywords); each value is a list
    of ``{"line", "match", "category"}`` entries.  NOTE: ``imports`` and
    ``function_calls`` are part of the schema but are NOT populated by this
    regex pass — presumably the structural parser fills them; confirm.
    """
    by_category: Dict[str, List[Dict]] = {
        "imports": [],
        "function_calls": [],
        "user_input_sources": [],
        "database_access": [],
        "sql_execution": [],
        "command_execution": [],
        "file_access": [],
        "crypto_usage": [],
        "debug_output": [],
        "error_handling": [],
        "possible_hardcoded_secrets": [],
        "auth_related_keywords": [],
    }
    for line_no, line in enumerate(content.splitlines(), start=1):
        for regex, category in _COMPILED:
            if regex.search(line):
                key = _CATEGORY_TO_KEY.get(category, category)
                by_category.setdefault(key, []).append({
                    "line": line_no,
                    "match": line.strip()[:200],  # cap stored evidence at 200 chars
                    "category": category,
                })
    # Whole-text DB-access heuristic; line 0 means "no specific line".
    if _DB_HEURISTIC.search(content):
        by_category["database_access"].append(
            {"line": 0, "match": "heuristic", "category": "database_access"}
        )
    return by_category


def risk_hints_from_signals(signals: Dict[str, List]) -> List[str]:
    """Collapse a signals dict into a sorted list of coarse risk hints."""
    hints: Set[str] = set()
    if signals.get("sql_execution"):
        hints.add("sql_execution_present")
    if signals.get("command_execution"):
        hints.add("command_execution_present")
    if signals.get("possible_hardcoded_secrets"):
        hints.add("possible_hardcoded_secret")
    if signals.get("user_input_sources"):
        hints.add("user_input_flow")
    if signals.get("file_access"):
        hints.add("file_io")
    # "broad_except" is kept for callers passing raw categories;
    # extract_signals itself emits the mapped "error_handling" key.
    if signals.get("broad_except") or signals.get("error_handling"):
        hints.add("error_handling_review")
    if signals.get("auth_related_keywords"):
        hints.add("auth_surface")
    return sorted(hints)
#!/usr/bin/env python3
"""Pre-scan: run semgrep + gitleaks over a path and emit a JSON report."""
import argparse
import json
import os
import shutil
import subprocess
import sys
from pathlib import Path


def run_semgrep(target_path: str) -> dict:
    """Run semgrep over ``target_path``.

    Returns ``{"tool": "semgrep", "error": <str|None>, "results": <list>}``.
    Never raises, so the overall report can always be assembled.
    """
    path = Path(target_path).resolve()
    if not path.exists():
        return {"tool": "semgrep", "error": f"path not found: {target_path}", "results": []}

    # Prefer the semgrep console script: invoking `python -m semgrep` is
    # deprecated since semgrep 1.38 and makes newer versions emit a
    # deprecation error instead of scan results (see prescan_report.json).
    semgrep_bin = shutil.which("semgrep")
    launcher = [semgrep_bin] if semgrep_bin else [sys.executable, "-m", "semgrep"]
    cmd = launcher + [
        "scan",
        "--config", "auto",
        "--json",
        "--quiet",
        str(path),
    ]
    try:
        proc = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=300,
        )
        stdout = proc.stdout.strip()
        # semgrep exits non-zero when findings exist; only treat it as a
        # failure when there is no JSON on stdout to parse.
        if proc.returncode != 0 and not stdout:
            return {
                "tool": "semgrep",
                "error": proc.stderr.strip() or f"exit code {proc.returncode}",
                "results": [],
            }
        data = json.loads(stdout) if stdout else {}
        # Normalize: "results" is always a list (previously the whole dict
        # could leak through via data.get("results", data)).
        results = data.get("results", []) if isinstance(data, dict) else data
        return {"tool": "semgrep", "error": None, "results": results}
    except FileNotFoundError:
        return {"tool": "semgrep", "error": "semgrep not installed (pip install semgrep)", "results": []}
    except subprocess.TimeoutExpired:
        return {"tool": "semgrep", "error": "timeout", "results": []}
    except json.JSONDecodeError as e:
        return {"tool": "semgrep", "error": str(e), "results": []}
def run_gitleaks(target_path: str) -> dict:
    """Run ``gitleaks detect`` over ``target_path``.

    Returns ``{"tool": "gitleaks", "error": <str|None>, "results": <list>}``.
    Never raises.  NOTE(review): gitleaks scans directories, so a file input
    scans the file's whole parent directory — confirm that scope is intended.
    """
    path = Path(target_path).resolve()
    if not path.exists():
        return {"tool": "gitleaks", "error": f"path not found: {target_path}", "results": []}
    source = str(path) if path.is_dir() else str(path.parent)
    cmd = [
        "gitleaks", "detect",
        "--source", source,
        "--no-git",
        "--report-format", "json",
        # "-" requests the report on stdout — TODO confirm this is supported
        # by the gitleaks version used on this project.
        "--report-path", "-",
    ]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    except FileNotFoundError:
        return {"tool": "gitleaks", "error": "gitleaks not installed", "results": []}
    except subprocess.TimeoutExpired:
        return {"tool": "gitleaks", "error": "timeout", "results": []}

    # Exit code 0 = clean, 1 = leaks found; anything else is a tool failure
    # (previously misreported as a clean scan with no findings).
    if proc.returncode not in (0, 1):
        return {
            "tool": "gitleaks",
            "error": proc.stderr.strip() or f"exit code {proc.returncode}",
            "results": [],
        }
    raw = proc.stdout.strip()
    if not raw:
        return {"tool": "gitleaks", "error": None, "results": []}
    try:
        data = json.loads(raw)
    except json.JSONDecodeError as exc:
        # Surface the parse failure instead of silently reporting no findings.
        return {"tool": "gitleaks", "error": f"unparseable gitleaks output: {exc}", "results": []}
    results = data if isinstance(data, list) else data.get("findings", data.get("results", []))
    return {"tool": "gitleaks", "error": None, "results": results}
written to: {out_path}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/prescan_report.json b/prescan_report.json
new file mode 100644
index 0000000..63f87d9
--- /dev/null
+++ b/prescan_report.json
@@ -0,0 +1,13 @@
+{
+  "input_path": "/Users/zhangtingen/Downloads/V/testquery.py",
+  "semgrep": {
+    "tool": "semgrep",
+    "error": "Using `python -m semgrep` to run Semgrep is deprecated as of 1.38.0. Please simply run `semgrep` instead.",
+    "results": []
+  },
+  "gitleaks": {
+    "tool": "gitleaks",
+    "error": "gitleaks not installed",
+    "results": []
+  }
+}
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e7c13db..aa31d5f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,10 @@
 eel
 pyqrcode
 pyinstaller
 pypng
 autopep8
+psycopg2-binary
+SQLAlchemy
+python-dotenv
+bcrypt
+semgrep
diff --git a/run_preprocess_test.py b/run_preprocess_test.py
new file mode 100644
index 0000000..6709079
--- /dev/null
+++ b/run_preprocess_test.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+"""
+Run preprocessing and write JSON.
+
+By default writes the LLM-friendly payload only (compact, no chunk content).
+Use --full for complete preprocess output (scanners / cache).
+ +Usage: + python3 run_preprocess_test.py # -> ai_payload.json (LLM) + python3 run_preprocess_test.py -o out.json # LLM payload to out.json + python3 run_preprocess_test.py --full # full preprocess JSON + python3 run_preprocess_test.py --full -o pre.json + python3 run_preprocess_test.py other.py -o x.json +""" + +import argparse +import json +import sys +from pathlib import Path + +ROOT = Path(__file__).resolve().parent +sys.path.insert(0, str(ROOT)) + +from preprocess.pipeline import run_file # noqa: E402 +from preprocess.normalized_findings import run_file_ai_payload # noqa: E402 + +DEFAULT_INPUT = ROOT / "testquery.py" +DEFAULT_OUTPUT_LLM = "ai_payload.json" +DEFAULT_OUTPUT_FULL = "preprocess_output.json" + + +def main(): + parser = argparse.ArgumentParser( + description="Preprocess a file -> JSON (default: LLM payload only)", + ) + parser.add_argument( + "input", + nargs="?", + default=str(DEFAULT_INPUT), + help="File path to preprocess (default: testquery.py)", + ) + parser.add_argument( + "-o", "--output", + default=None, + help="Output JSON path (default: ai_payload.json or preprocess_output.json with --full)", + ) + parser.add_argument( + "--full", + action="store_true", + help="Write full preprocess JSON (includes chunk content; for scanners, not for LLM)", + ) + args = parser.parse_args() + + input_path = Path(args.input) + if not input_path.is_file(): + print(f"Not a file: {input_path}", file=sys.stderr) + sys.exit(1) + + if args.full: + data = run_file(input_path) + out = args.output or DEFAULT_OUTPUT_FULL + else: + data = run_file_ai_payload(input_path) + out = args.output or DEFAULT_OUTPUT_LLM + + out_path = Path(out) + out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"Wrote {out_path}") + print(f" project_id: {data.get('project_id', '')[:16]}...") + + if args.full: + print(f" mode: full preprocess") + print(f" files: {len(data['files'])}, chunks: {len(data['chunks'])}, skipped: 
{len(data['files_skipped'])}") + else: + n = len(data.get("normalized_findings") or []) + print(f" mode: LLM payload ({n} findings, no chunk content)") + + +if __name__ == "__main__": + main() diff --git a/testquery.py b/testquery.py new file mode 100644 index 0000000..18cd9e2 --- /dev/null +++ b/testquery.py @@ -0,0 +1,15 @@ +from sqlalchemy import text +from db import engine + +try: + with engine.connect() as conn: + # Run a simple query to test + result = conn.execute(text("SELECT NOW();")) + print("Connected! Server time:", result.fetchone()[0]) +except Exception as e: + print("Connection failed:", e) + +with engine.connect() as conn: + #conn.execute(text("INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');")) + result = conn.execute(text("SELECT * FROM users;")) + print(result.fetchall())