From 0ed1b94c48ac61aaed7ce616cd73cf3ab0e30242 Mon Sep 17 00:00:00 2001 From: kaye-s Date: Wed, 11 Feb 2026 17:41:22 -0600 Subject: [PATCH 01/18] test! --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index cedf85d..994422a 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import eel +#sofia eel.init('front-end') From 69a3118b255aff665e2979e9e76620eb68dc6fe4 Mon Sep 17 00:00:00 2001 From: jacob Date: Wed, 11 Feb 2026 17:45:11 -0600 Subject: [PATCH 02/18] Jacob Test --- main.py | 1 + 1 file changed, 1 insertion(+) diff --git a/main.py b/main.py index 994422a..b600c9f 100644 --- a/main.py +++ b/main.py @@ -1,5 +1,6 @@ import eel #sofia +#jacob eel.init('front-end') From 38602780efbbca505da7c3b581f0728f506d8530 Mon Sep 17 00:00:00 2001 From: zhangtingen Date: Wed, 11 Feb 2026 17:56:19 -0600 Subject: [PATCH 03/18] tim --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index b600c9f..aae1daa 100644 --- a/main.py +++ b/main.py @@ -1,7 +1,7 @@ import eel #sofia #jacob - +#tim eel.init('front-end') From 99a712e2ce34ff082fef4071a1dc454823baec33 Mon Sep 17 00:00:00 2001 From: zhangtingen Date: Thu, 12 Feb 2026 12:59:29 -0600 Subject: [PATCH 04/18] connect openAI api to backend --- .gitignore | 1 + main.py | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/.gitignore b/.gitignore index 3557e5d..06125d6 100644 --- a/.gitignore +++ b/.gitignore @@ -189,3 +189,4 @@ gradle-app.setting /.vs/ node_modules/ +.env diff --git a/main.py b/main.py index aae1daa..a64024d 100644 --- a/main.py +++ b/main.py @@ -1,9 +1,25 @@ import eel +import os +from dotenv import load_dotenv +from openai import OpenAI #sofia #jacob #tim eel.init('front-end') +load_dotenv() +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) + +def test_openai(): + if not os.getenv("OPENAI_API_KEY"): + print("ERROR: OPENAI_API_KEY not found. 
Put it in .env (same folder as main.py).") + return + + resp = client.chat.completions.create( + model="gpt-4.1-mini", + messages=[{"role": "user", "content": "Say hello in one sentence."}], + ) + print("OpenAI test reply:", resp.choices[0].message.content) @eel.expose def add(num1, num2): From 3b4c170e00c2cddc1672cdaed1c605a44dc4f203 Mon Sep 17 00:00:00 2001 From: zhangtingen Date: Thu, 12 Feb 2026 21:30:00 -0600 Subject: [PATCH 05/18] connet frontend-backend-openai api --- front-end/index.html | 23 +++++++++++++---------- front-end/scripts/main.js | 11 +++++++++++ main.py | 26 +++++++++++++++++--------- 3 files changed, 41 insertions(+), 19 deletions(-) diff --git a/front-end/index.html b/front-end/index.html index 3a3ade3..e2ab344 100644 --- a/front-end/index.html +++ b/front-end/index.html @@ -1,25 +1,28 @@ - - Hello World! - + + - - - - - - + + + +
- \ No newline at end of file + + + + +

+
+
diff --git a/front-end/scripts/main.js b/front-end/scripts/main.js
index feeebf0..56a9f67 100644
--- a/front-end/scripts/main.js
+++ b/front-end/scripts/main.js
@@ -6,3 +6,14 @@ function operate(operator) {
 		document.querySelector('#output').innerText = result;
 	});
 }
+
+function askGPT() {
+	const prompt = document.querySelector('#prompt').value;
+
+	document.querySelector('#output').innerText = "Loading...";
+
+	eel.ask_api(prompt)(result => {
+		document.querySelector('#output').innerText = result;
+	});
+
+}
\ No newline at end of file
diff --git a/main.py b/main.py
index a64024d..7a8aab7 100644
--- a/main.py
+++ b/main.py
@@ -8,18 +8,25 @@
 eel.init('front-end')
 
 load_dotenv()
+print("API key loaded:", bool(os.getenv("OPENAI_API_KEY")))
 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 
-def test_openai():
-    if not os.getenv("OPENAI_API_KEY"):
-        print("ERROR: OPENAI_API_KEY not found. Put it in .env (same folder as main.py).")
-        return
+@eel.expose
+def ask_api(user_text):
+    print("ask_api received:", user_text)
 
     resp = client.chat.completions.create(
-        model="gpt-4.1-mini",
-        messages=[{"role": "user", "content": "Say hello in one sentence."}],
-    )
-    print("OpenAI test reply:", resp.choices[0].message.content)
+            model="gpt-4.1-mini",
+            messages=[{"role": "user", "content": str(user_text)}],
+        )
+
+    answer = resp.choices[0].message.content
+    print("ask_api answer:", answer)
+    return answer
+
+
+
+
 
 @eel.expose
 def add(num1, num2):
@@ -31,4 +38,5 @@ def subtract(num1, num2):
     return int(num1) - int(num2)
 
 
-eel.start('index.html', size=(1000, 600))
+if __name__ == "__main__":
+    eel.start('index.html', size=(1000, 600), mode='safari')

From 0ae235de84cd195a6a1de2cdcd6cedc70c61990c Mon Sep 17 00:00:00 2001
From: kaye-s 
Date: Mon, 16 Feb 2026 16:13:48 -0600
Subject: [PATCH 06/18] database connected to environment

---
 db.py            | 21 ++++++++++++++
 main.py          |  5 ++++
 models.py        | 74 ++++++++++++++++++++++++++++++++++++++++++++++++
 requirements.txt |  3 ++
 testquery.py     | 27 ++++++++++++++++++
 5 files changed, 130 insertions(+)
 create mode 100644 db.py
 create mode 100644 models.py
 create mode 100644 testquery.py

diff --git a/db.py b/db.py
new file mode 100644
index 0000000..4212ab5
--- /dev/null
+++ b/db.py
@@ -0,0 +1,21 @@
+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+from dotenv import load_dotenv
+
+load_dotenv()
+
+DB_URL = f"postgresql://" \
+         f"{os.getenv('DB_USER')}:" \
+         f"{os.getenv('DB_PASS')}@" \
+         f"{os.getenv('DB_HOST')}:" \
+         f"{os.getenv('DB_PORT')}/" \
+         f"{os.getenv('DB_NAME')}"
+
+engine = create_engine(
+    DB_URL,
+    echo=True,
+    connect_args={"sslmode": "require"}
+)
+
+SessionLocal = sessionmaker(bind=engine)
diff --git a/main.py b/main.py
index aae1daa..579b0b6 100644
--- a/main.py
+++ b/main.py
@@ -1,9 +1,14 @@
 import eel
+from db import SessionLocal
 #sofia
 #jacob
 #tim
 eel.init('front-end')
 
+session = SessionLocal()
+
+print("Connected successfully!")
+
 
 @eel.expose
 def add(num1, num2):
diff --git a/models.py b/models.py
new file mode 100644
index 0000000..6da1d64
--- /dev/null
+++ b/models.py
@@ -0,0 +1,74 @@
+import os
+from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Numeric, Enum
+from sqlalchemy.orm import declarative_base, relationship, sessionmaker
+from datetime import datetime
+import enum
+from dotenv import load_dotenv
+
+load_dotenv()
+
+Base = declarative_base()
+
+# Severity Enum
+class SeverityEnum(enum.Enum):
+    Low = "Low"
+    Medium = "Medium"
+    High = "High"
+    Critical = "Critical"
+
+
+class User(Base):
+    __tablename__ = "users"
+
+    user_id = Column(Integer, primary_key=True, autoincrement=True)
+    email = Column(String(255), unique=True, nullable=False)
+    password_hash = Column(Text, nullable=False)
+    created_at = Column(DateTime, default=datetime.utcnow)
+
+    submissions = relationship("CodeSubmission", back_populates="user")
+
+
+class CodeSubmission(Base):
+    __tablename__ = "code_submissions"
+
+    submission_id = Column(Integer, primary_key=True, autoincrement=True)
+    user_id = Column(Integer, ForeignKey("users.user_id", ondelete="CASCADE"), nullable=False)
+    submission_name = Column(String(255))
+    uploaded_at = Column(DateTime, default=datetime.utcnow)
+    overall_risk_score = Column(Numeric(5,2))
+    simplified_summary = Column(Text)
+    detailed_summary = Column(Text)
+
+    user = relationship("User", back_populates="submissions")
+    files = relationship("File", back_populates="submission")
+    threats = relationship("Threat", back_populates="submission")
+
+
+class File(Base):
+    __tablename__ = "files"
+
+    file_id = Column(Integer, primary_key=True, autoincrement=True)
+    submission_id = Column(Integer, ForeignKey("code_submissions.submission_id", ondelete="CASCADE"), nullable=False)
+    file_name = Column(String(255), nullable=False)
+    file_path = Column(Text, nullable=False)
+    file_type = Column(String(100))
+
+    submission = relationship("CodeSubmission", back_populates="files")
+    threats = relationship("Threat", back_populates="file")
+
+
+class Threat(Base):
+    __tablename__ = "threats"
+
+    threat_id = Column(Integer, primary_key=True, autoincrement=True)
+    submission_id = Column(Integer, ForeignKey("code_submissions.submission_id", ondelete="CASCADE"), nullable=False)
+    file_id = Column(Integer, ForeignKey("files.file_id", ondelete="SET NULL"), nullable=True)
+    title = Column(String(255), nullable=False)
+    description = Column(Text)
+    severity_level = Column(Enum(SeverityEnum), nullable=False)
+    severity_score = Column(Numeric(5,2))
+    recommendation = Column(Text)
+    line_number = Column(Integer)
+
+    submission = relationship("CodeSubmission", back_populates="threats")
+    file = relationship("File", back_populates="threats")
diff --git a/requirements.txt b/requirements.txt
index e7c13db..c0be71b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,6 @@ pyqrcode
 pyinstaller
 pypng
 autopep8
+psycopg2-binary
+SQLAlchemy
+python-dotenv
diff --git a/testquery.py b/testquery.py
new file mode 100644
index 0000000..cf61dd8
--- /dev/null
+++ b/testquery.py
@@ -0,0 +1,27 @@
+from sqlalchemy import create_engine, text
+
+# Replace these with your Supabase info
+USER = "postgres"
+PASSWORD = "CapstoneVSecurity123"
+HOST = "db.zzraywtbowpotrqbevkz.supabase.co"
+PORT = "5432"
+DATABASE = "postgres"
+
+# This is the connection URL SQLAlchemy needs
+DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}"
+
+# Create a connection
+engine = create_engine(DB_URL)
+
+try:
+    with engine.connect() as conn:
+        # Run a simple query to test
+        result = conn.execute(text("SELECT NOW();"))
+        print("Connected! Server time:", result.fetchone()[0])
+except Exception as e:
+    print("Connection failed:", e)
+
+with engine.connect() as conn:
+    #conn.execute(text("INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');"))
+    result = conn.execute(text("SELECT * FROM users;"))
+    print(result.fetchall())
\ No newline at end of file

From 5f3d3fad0f303676c92e79b6e25dc38983b82e83 Mon Sep 17 00:00:00 2001
From: kaye-s 
Date: Wed, 18 Feb 2026 12:42:46 -0600
Subject: [PATCH 07/18] database secrets preserved

---
 testquery.py | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/testquery.py b/testquery.py
index cf61dd8..80e1505 100644
--- a/testquery.py
+++ b/testquery.py
@@ -1,17 +1,5 @@
-from sqlalchemy import create_engine, text
-
-# Replace these with your Supabase info
-USER = "postgres"
-PASSWORD = "CapstoneVSecurity123"
-HOST = "db.zzraywtbowpotrqbevkz.supabase.co"
-PORT = "5432"
-DATABASE = "postgres"
-
-# This is the connection URL SQLAlchemy needs
-DB_URL = f"postgresql+psycopg2://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}"
-
-# Create a connection
-engine = create_engine(DB_URL)
+from sqlalchemy import text
+from db import engine
 
 try:
     with engine.connect() as conn:

From d85a13a877983ca4d5be09842c740bc15c1b3af1 Mon Sep 17 00:00:00 2001
From: kaye-s 
Date: Wed, 18 Feb 2026 12:43:30 -0600
Subject: [PATCH 08/18] Revert "database connected to environment"

This reverts commit 0ae235de84cd195a6a1de2cdcd6cedc70c61990c.

# Conflicts:
#	testquery.py
---
 db.py            | 21 --------------
 main.py          |  5 ----
 models.py        | 74 ------------------------------------------------
 requirements.txt |  3 --
 4 files changed, 103 deletions(-)
 delete mode 100644 db.py
 delete mode 100644 models.py

diff --git a/db.py b/db.py
deleted file mode 100644
index 4212ab5..0000000
--- a/db.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import os
-from sqlalchemy import create_engine
-from sqlalchemy.orm import sessionmaker
-from dotenv import load_dotenv
-
-load_dotenv()
-
-DB_URL = f"postgresql://" \
-         f"{os.getenv('DB_USER')}:" \
-         f"{os.getenv('DB_PASS')}@" \
-         f"{os.getenv('DB_HOST')}:" \
-         f"{os.getenv('DB_PORT')}/" \
-         f"{os.getenv('DB_NAME')}"
-
-engine = create_engine(
-    DB_URL,
-    echo=True,
-    connect_args={"sslmode": "require"}
-)
-
-SessionLocal = sessionmaker(bind=engine)
diff --git a/main.py b/main.py
index 579b0b6..aae1daa 100644
--- a/main.py
+++ b/main.py
@@ -1,14 +1,9 @@
 import eel
-from db import SessionLocal
 #sofia
 #jacob
 #tim
 eel.init('front-end')
 
-session = SessionLocal()
-
-print("Connected successfully!")
-
 
 @eel.expose
 def add(num1, num2):
diff --git a/models.py b/models.py
deleted file mode 100644
index 6da1d64..0000000
--- a/models.py
+++ /dev/null
@@ -1,74 +0,0 @@
-import os
-from sqlalchemy import create_engine, Column, Integer, String, Text, DateTime, ForeignKey, Numeric, Enum
-from sqlalchemy.orm import declarative_base, relationship, sessionmaker
-from datetime import datetime
-import enum
-from dotenv import load_dotenv
-
-load_dotenv()
-
-Base = declarative_base()
-
-# Severity Enum
-class SeverityEnum(enum.Enum):
-    Low = "Low"
-    Medium = "Medium"
-    High = "High"
-    Critical = "Critical"
-
-
-class User(Base):
-    __tablename__ = "users"
-
-    user_id = Column(Integer, primary_key=True, autoincrement=True)
-    email = Column(String(255), unique=True, nullable=False)
-    password_hash = Column(Text, nullable=False)
-    created_at = Column(DateTime, default=datetime.utcnow)
-
-    submissions = relationship("CodeSubmission", back_populates="user")
-
-
-class CodeSubmission(Base):
-    __tablename__ = "code_submissions"
-
-    submission_id = Column(Integer, primary_key=True, autoincrement=True)
-    user_id = Column(Integer, ForeignKey("users.user_id", ondelete="CASCADE"), nullable=False)
-    submission_name = Column(String(255))
-    uploaded_at = Column(DateTime, default=datetime.utcnow)
-    overall_risk_score = Column(Numeric(5,2))
-    simplified_summary = Column(Text)
-    detailed_summary = Column(Text)
-
-    user = relationship("User", back_populates="submissions")
-    files = relationship("File", back_populates="submission")
-    threats = relationship("Threat", back_populates="submission")
-
-
-class File(Base):
-    __tablename__ = "files"
-
-    file_id = Column(Integer, primary_key=True, autoincrement=True)
-    submission_id = Column(Integer, ForeignKey("code_submissions.submission_id", ondelete="CASCADE"), nullable=False)
-    file_name = Column(String(255), nullable=False)
-    file_path = Column(Text, nullable=False)
-    file_type = Column(String(100))
-
-    submission = relationship("CodeSubmission", back_populates="files")
-    threats = relationship("Threat", back_populates="file")
-
-
-class Threat(Base):
-    __tablename__ = "threats"
-
-    threat_id = Column(Integer, primary_key=True, autoincrement=True)
-    submission_id = Column(Integer, ForeignKey("code_submissions.submission_id", ondelete="CASCADE"), nullable=False)
-    file_id = Column(Integer, ForeignKey("files.file_id", ondelete="SET NULL"), nullable=True)
-    title = Column(String(255), nullable=False)
-    description = Column(Text)
-    severity_level = Column(Enum(SeverityEnum), nullable=False)
-    severity_score = Column(Numeric(5,2))
-    recommendation = Column(Text)
-    line_number = Column(Integer)
-
-    submission = relationship("CodeSubmission", back_populates="threats")
-    file = relationship("File", back_populates="threats")
diff --git a/requirements.txt b/requirements.txt
index c0be71b..e7c13db 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,6 +3,3 @@ pyqrcode
 pyinstaller
 pypng
 autopep8
-psycopg2-binary
-SQLAlchemy
-python-dotenv

From 8377edc021735d1ec31fb97bfd7acc565c560162 Mon Sep 17 00:00:00 2001
From: kaye-s 
Date: Wed, 18 Feb 2026 12:49:04 -0600
Subject: [PATCH 09/18] OKAY NOW FIXED, be sure to get .env file updated with
 new credentials

---
 db.py            | 21 +++++++++++++++++++++
 requirements.txt |  3 +++
 2 files changed, 24 insertions(+)
 create mode 100644 db.py

diff --git a/db.py b/db.py
new file mode 100644
index 0000000..5afc09b
--- /dev/null
+++ b/db.py
@@ -0,0 +1,21 @@
+import os
+from sqlalchemy import create_engine
+from sqlalchemy.orm import sessionmaker
+from dotenv import load_dotenv
+
+load_dotenv()
+
+DB_URL = f"postgresql://" \
+         f"{os.getenv('DB_USER')}:" \
+         f"{os.getenv('DB_PASS')}@" \
+         f"{os.getenv('DB_HOST')}:" \
+         f"{os.getenv('DB_PORT')}/" \
+         f"{os.getenv('DB_NAME')}"
+
+engine = create_engine(
+    DB_URL,
+    echo=True,
+    connect_args={"sslmode": "require"}
+)
+
+SessionLocal = sessionmaker(bind=engine)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index e7c13db..c0be71b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,6 @@ pyqrcode
 pyinstaller
 pypng
 autopep8
+psycopg2-binary
+SQLAlchemy
+python-dotenv

From 012e4180e0d1721f071b93df7329d7154d0811e0 Mon Sep 17 00:00:00 2001
From: kaye-s 
Date: Wed, 18 Feb 2026 13:08:46 -0600
Subject: [PATCH 10/18] showuser add user database functionality linked up

---
 front-end/index.html      |  7 +++++++
 front-end/scripts/main.js | 15 +++++++++++++++
 main.py                   | 28 +++++++++++++++++++++++++++-
 3 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/front-end/index.html b/front-end/index.html
index 3a3ade3..bd942c2 100644
--- a/front-end/index.html
+++ b/front-end/index.html
@@ -19,6 +19,13 @@
     
     
     
+    
+
+ + + + + diff --git a/front-end/scripts/main.js b/front-end/scripts/main.js index feeebf0..133a2ba 100644 --- a/front-end/scripts/main.js +++ b/front-end/scripts/main.js @@ -6,3 +6,18 @@ function operate(operator) { document.querySelector('#output').innerText = result; }); } + +function loadUsers() { + eel.showUsers()(users => { + document.querySelector('#output').innerText = JSON.stringify(users, null, 2); + }); +} + +function addUsers() { + var email = document.querySelector('#email').value; + var password = document.querySelector('#pass').value; + + eel.addUsers(email, password)(response => { + document.querySelector('#output').innerText = response; + }); +} \ No newline at end of file diff --git a/main.py b/main.py index aae1daa..e94e512 100644 --- a/main.py +++ b/main.py @@ -1,18 +1,44 @@ import eel +from db import engine +from sqlalchemy import text #sofia #jacob #tim eel.init('front-end') +try: + with engine.connect() as conn: + # Run a simple query to test + result = conn.execute(text("SELECT NOW();")) + print("Connected! 
Server time:", result.fetchone()[0]) +except Exception as e: + print("Connection failed:", e) @eel.expose def add(num1, num2): return int(num1) + int(num2) - @eel.expose def subtract(num1, num2): return int(num1) - int(num2) +@eel.expose +def showUsers(): + with engine.connect() as conn: + result = conn.execute(text("SELECT * FROM users;")) + users = result.fetchall() + return [dict(row._mapping) for row in users] + +@eel.expose +def addUsers(email, password): + #hashing logic here + + with engine.begin() as conn: # auto-commit + conn.execute( + text("INSERT INTO users (email, password_hash) VALUES (:email, :password)"), + {"email": email, "password": password} + ) + return "User added successfully" + eel.start('index.html', size=(1000, 600)) From c7beafbe103ffc9135c4d150d26a94ada9f71869 Mon Sep 17 00:00:00 2001 From: kaye-s Date: Wed, 18 Feb 2026 13:10:59 -0600 Subject: [PATCH 11/18] password hashing --- main.py | 4 +++- requirements.txt | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/main.py b/main.py index e94e512..b94ee97 100644 --- a/main.py +++ b/main.py @@ -1,6 +1,7 @@ import eel from db import engine from sqlalchemy import text +import bcrypt #sofia #jacob #tim @@ -32,11 +33,12 @@ def showUsers(): @eel.expose def addUsers(email, password): #hashing logic here + hashed = bcrypt.hashpw(password.encode(), bcrypt.gensalt()).decode() with engine.begin() as conn: # auto-commit conn.execute( text("INSERT INTO users (email, password_hash) VALUES (:email, :password)"), - {"email": email, "password": password} + {"email": email, "password": hashed} ) return "User added successfully" diff --git a/requirements.txt b/requirements.txt index c0be71b..c5e047d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -6,3 +6,4 @@ autopep8 psycopg2-binary SQLAlchemy python-dotenv +bcrypt From a6c2e34d99e70891fb1da8401eb1c25de60e60fe Mon Sep 17 00:00:00 2001 From: kaye-s Date: Wed, 18 Feb 2026 17:17:14 -0600 Subject: [PATCH 12/18] Moved db queries to new html 
file --- front-end/db_queries.html | 32 ++++++++++++++++++++++++++++++++ front-end/index.html | 17 +++++------------ 2 files changed, 37 insertions(+), 12 deletions(-) create mode 100644 front-end/db_queries.html diff --git a/front-end/db_queries.html b/front-end/db_queries.html new file mode 100644 index 0000000..f840170 --- /dev/null +++ b/front-end/db_queries.html @@ -0,0 +1,32 @@ + + + + + + + + + Hello World! + + + + + + + + + + + + +
+
+ + + + + + + + + \ No newline at end of file diff --git a/front-end/index.html b/front-end/index.html index bd942c2..112bd20 100644 --- a/front-end/index.html +++ b/front-end/index.html @@ -15,18 +15,11 @@ - - - - -
-
- - - - - - + + + + + \ No newline at end of file From 8ee5c8783a76880528b755f1490b9de30b5a09fc Mon Sep 17 00:00:00 2001 From: jacob Date: Wed, 18 Feb 2026 17:27:34 -0600 Subject: [PATCH 13/18] Initial backend commit: barebones backend inside app folder GroupFive, with model for AnalysisTask including request structure, user, language etc. Some simple tests on creation, APIView endpoint for analysis task and status endpoint. Uses serializer to interpret data from frontend --- GroupFive/__init__.py | 0 GroupFive/admin.py | 3 + GroupFive/apps.py | 6 ++ GroupFive/dummy_analysis.py | 13 +++ GroupFive/migrations/0001_initial.py | 30 +++++++ GroupFive/migrations/__init__.py | 0 GroupFive/models.py | 21 +++++ GroupFive/serializers.py | 7 ++ GroupFive/tasks.py | 22 +++++ GroupFive/tests.py | 60 +++++++++++++ GroupFive/views.py | 42 +++++++++ config/__init__.py | 0 config/asgi.py | 16 ++++ config/settings.py | 125 +++++++++++++++++++++++++++ config/urls.py | 26 ++++++ config/wsgi.py | 16 ++++ manage.py | 22 +++++ 17 files changed, 409 insertions(+) create mode 100644 GroupFive/__init__.py create mode 100644 GroupFive/admin.py create mode 100644 GroupFive/apps.py create mode 100644 GroupFive/dummy_analysis.py create mode 100644 GroupFive/migrations/0001_initial.py create mode 100644 GroupFive/migrations/__init__.py create mode 100644 GroupFive/models.py create mode 100644 GroupFive/serializers.py create mode 100644 GroupFive/tasks.py create mode 100644 GroupFive/tests.py create mode 100644 GroupFive/views.py create mode 100644 config/__init__.py create mode 100644 config/asgi.py create mode 100644 config/settings.py create mode 100644 config/urls.py create mode 100644 config/wsgi.py create mode 100644 manage.py diff --git a/GroupFive/__init__.py b/GroupFive/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/admin.py b/GroupFive/admin.py new file mode 100644 index 0000000..8c38f3f --- /dev/null +++ b/GroupFive/admin.py @@ -0,0 +1,3 @@ +from 
django.contrib import admin + +# Register your models here. diff --git a/GroupFive/apps.py b/GroupFive/apps.py new file mode 100644 index 0000000..8220433 --- /dev/null +++ b/GroupFive/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class OurApplicationConfig(AppConfig): + default_auto_field = 'django.db.models.BigAutoField' + name = 'GroupFive' diff --git a/GroupFive/dummy_analysis.py b/GroupFive/dummy_analysis.py new file mode 100644 index 0000000..992ba89 --- /dev/null +++ b/GroupFive/dummy_analysis.py @@ -0,0 +1,13 @@ + +def run_dummy(code, language): + + return { + "summary" : "this dummy code is better than yours", + "findings" : [ + { + "severity" : "Minimal", + "description" : "Bad code", + "fix" : "Figure it Out" + } + ] + } \ No newline at end of file diff --git a/GroupFive/migrations/0001_initial.py b/GroupFive/migrations/0001_initial.py new file mode 100644 index 0000000..b8f9b01 --- /dev/null +++ b/GroupFive/migrations/0001_initial.py @@ -0,0 +1,30 @@ +# Generated by Django 5.0.3 on 2026-02-18 09:51 + +import django.db.models.deletion +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.CreateModel( + name='AnalysisTask', + fields=[ + ('id', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)), + ('input_code', models.TextField()), + ('language', models.CharField(max_length=50)), + ('status', models.CharField(choices=[('QUEUED', 'Queued'), ('RUNNING', 'Running'), ('COMPLETED', 'Completed'), ('FAILED', 'Failed')], max_length=20)), + ('results', models.JSONField(blank=True, null=True)), + ('created_at', models.DateTimeField(auto_now_add=True)), + ('user', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)), + ], + ), + ] diff --git a/GroupFive/migrations/__init__.py 
b/GroupFive/migrations/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/GroupFive/models.py b/GroupFive/models.py new file mode 100644 index 0000000..f6f0ea7 --- /dev/null +++ b/GroupFive/models.py @@ -0,0 +1,21 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +import uuid #for user ID +from django.db import models +from django.contrib.auth.models import User + +class AnalysisTask(models.Model): + #potential review request statuses + STATUS_OPT = [ + ("QUEUED", "Queued"), + ("RUNNING", "Running"), + ("COMPLETED", "Completed"), + ("FAILED", "Failed") + ] + + id = models.UUIDField(primary_key=True, default=uuid.uuid4) #more user id + user = models.ForeignKey(User, on_delete=models.CASCADE) #user id/user + input_code = models.TextField() #user provided code + language = models.CharField(max_length=50) #language of user provided code + status = models.CharField(max_length=20, choices=STATUS_OPT) #status of review request + results = models.JSONField(null=True, blank=True) #results of review + created_at = models.DateTimeField(auto_now_add=True) #creation timestamp diff --git a/GroupFive/serializers.py b/GroupFive/serializers.py new file mode 100644 index 0000000..255c15e --- /dev/null +++ b/GroupFive/serializers.py @@ -0,0 +1,7 @@ +#this file uses serializers to define what information we add to our AnalysisTask model from user +from rest_framework import serializers + +class AnalysisRequestSerializer(serializers.Serializer): + code = serializers.CharField() #for input code + #language definition of input code, can be commented out if language distinction added later + language = serializers.CharField() diff --git a/GroupFive/tasks.py b/GroupFive/tasks.py new file mode 100644 index 0000000..24c186e --- /dev/null +++ b/GroupFive/tasks.py @@ -0,0 +1,22 @@ +#from celery import shared_task #task queue to handle simultaneous requests, making testing annoying for now can readd later when necessary 
+from GroupFive.models import AnalysisTask +from .dummy_analysis import run_dummy + +#@shared_task --from celery, readd later +def run_analysis_async(task_id): + + #instance of analysisTask + task = AnalysisTask.objects.get(id=task_id) + task.status = "RUNNING" #update status + task.save() #save instance task + + try: + #call ai api rather than dummy + results = run_dummy(task.input_code, task.language) + + task.results = results #store results + task.status = "COMPLETED" #update status + except Exception(BaseException) as e: + task.status = "FAILED" + + task.save() \ No newline at end of file diff --git a/GroupFive/tests.py b/GroupFive/tests.py new file mode 100644 index 0000000..5f5da37 --- /dev/null +++ b/GroupFive/tests.py @@ -0,0 +1,60 @@ +from rest_framework.test import APITestCase +from django.contrib.auth.models import User +from rest_framework import status +from .models import * +from uuid import uuid4 + + +class InitialAnalysisTests(APITestCase): + + def setUp(self): + #create user + self.User = User.objects.create_user( + username="username", + password="password" + ) + self.client.login(username="username", password="password") + + def test_create_analysisTask(self): + + response = self.client.post("/api/GroupFive/",{ + "code" : "print('Hello World')", #code to analyze + "language" : "Python" #language of code + }, format="json") + + self.assertEqual(response.status_code, status.HTTP_200_OK) + self.assertIn("task_id", response.data) #task_id in data + self.assertEqual(response.data["status"], "QUEUED") + +class InitialWorkflowTest(APITestCase): + + def setUp(self): + #create user + self.User = User.objects.create_user( + username="username", + password="password" + ) + self.client.login(username="username", password="password") + + def test_initial_workflow(self): + response = self.client.post("/api/GroupFive/",{ + "code" : "print('Hello Again')", #code to analyze + "language" : "Python" #language of code + }, format="json") + + 
self.assertEqual(response.status_code, status.HTTP_200_OK) + + task_id = response.data["task_id"] + + task = AnalysisTask.objects.get(id=task_id) + + #confirm that dummy ran + self.assertEqual(task.status, "COMPLETED") + + result_response = self.client.get(f"/api/GroupFive/{task_id}") + + #ensure task endpoint + self.assertEqual(result_response.status_code, 200) + + + diff --git a/GroupFive/views.py b/GroupFive/views.py new file mode 100644 index 0000000..01cfa8e --- /dev/null +++ b/GroupFive/views.py @@ -0,0 +1,42 @@ +#all id related lines are noted and can be deleted or changed if user id is skipped or substituted +from rest_framework.views import APIView +from rest_framework.response import Response +from rest_framework.permissions import IsAuthenticated #for user id +from GroupFive.models import AnalysisTask +from GroupFive.serializers import AnalysisRequestSerializer +from .tasks import run_analysis_async + + +#analysis task endpoint +class AnalysisView(APIView): + permission_classes = [IsAuthenticated] + + def post(self, request): + serializer = AnalysisRequestSerializer(data=request.data) + serializer.is_valid(raise_exception=True) #deserialize, check correct input and format, raises 400 Bad Request on fail + + task = AnalysisTask.objects.create( + user=request.user, #user + input_code=serializer.validated_data["code"], + language=serializer.validated_data["language"], + status="QUEUED" + ) + + run_analysis_async(str(task.id)) + + return Response({ + "task_id": str(task.id), + "status": task.status + }) + +#status endpoint +class StatusView(APIView): + permission_classes = [IsAuthenticated] + + def get(self, request, task_id): + task = AnalysisTask.objects.get(id=task_id, user=request.user) #user + + return Response({ + "status": task.status, + "summary": task.results if task.status == "COMPLETED" else None + }) \ No newline at end of file diff --git a/config/__init__.py b/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git 
a/config/asgi.py b/config/asgi.py new file mode 100644 index 0000000..39149a0 --- /dev/null +++ b/config/asgi.py @@ -0,0 +1,16 @@ +""" +ASGI config for config project. + +It exposes the ASGI callable as a module-level variable named ``application``. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/asgi/ +""" + +import os + +from django.core.asgi import get_asgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_asgi_application() diff --git a/config/settings.py b/config/settings.py new file mode 100644 index 0000000..4a3bd85 --- /dev/null +++ b/config/settings.py @@ -0,0 +1,125 @@ +""" +Django settings for config project. + +Generated by 'django-admin startproject' using Django 5.0.3. + +For more information on this file, see +https://docs.djangoproject.com/en/5.0/topics/settings/ + +For the full list of settings and their values, see +https://docs.djangoproject.com/en/5.0/ref/settings/ +""" + +from pathlib import Path + +# Build paths inside the project like this: BASE_DIR / 'subdir'. +BASE_DIR = Path(__file__).resolve().parent.parent + + +# Quick-start development settings - unsuitable for production +# See https://docs.djangoproject.com/en/5.0/howto/deployment/checklist/ + +# SECURITY WARNING: keep the secret key used in production secret! +SECRET_KEY = 'django-insecure-y+j3zht6sr%!!2fg0&-ek^21&)yc+y+5a*-ly+@16$8$px)a$@' + +# SECURITY WARNING: don't run with debug turned on in production! 
+DEBUG = True + +ALLOWED_HOSTS = [] + + +# Application definition + +INSTALLED_APPS = [ + 'django.contrib.admin', + 'django.contrib.auth', + 'django.contrib.contenttypes', + 'django.contrib.sessions', + 'django.contrib.messages', + 'django.contrib.staticfiles', + 'GroupFive', + 'rest_framework' +] + +MIDDLEWARE = [ + 'django.middleware.security.SecurityMiddleware', + 'django.contrib.sessions.middleware.SessionMiddleware', + 'django.middleware.common.CommonMiddleware', + 'django.middleware.csrf.CsrfViewMiddleware', + 'django.contrib.auth.middleware.AuthenticationMiddleware', + 'django.contrib.messages.middleware.MessageMiddleware', + 'django.middleware.clickjacking.XFrameOptionsMiddleware', +] + +ROOT_URLCONF = 'config.urls' + +TEMPLATES = [ + { + 'BACKEND': 'django.template.backends.django.DjangoTemplates', + 'DIRS': [], + 'APP_DIRS': True, + 'OPTIONS': { + 'context_processors': [ + 'django.template.context_processors.debug', + 'django.template.context_processors.request', + 'django.contrib.auth.context_processors.auth', + 'django.contrib.messages.context_processors.messages', + ], + }, + }, +] + +WSGI_APPLICATION = 'config.wsgi.application' + + +# Database +# https://docs.djangoproject.com/en/5.0/ref/settings/#databases + +DATABASES = { + 'default': { + 'ENGINE': 'django.db.backends.sqlite3', + 'NAME': BASE_DIR / 'db.sqlite3', + } +} + + +# Password validation +# https://docs.djangoproject.com/en/5.0/ref/settings/#auth-password-validators + +AUTH_PASSWORD_VALIDATORS = [ + { + 'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator', + }, + { + 'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator', + }, +] + + +# Internationalization +# https://docs.djangoproject.com/en/5.0/topics/i18n/ + +LANGUAGE_CODE = 'en-us' + +TIME_ZONE = 'UTC' + +USE_I18N = 
True + +USE_TZ = True + + +# Static files (CSS, JavaScript, Images) +# https://docs.djangoproject.com/en/5.0/howto/static-files/ + +STATIC_URL = 'static/' + +# Default primary key field type +# https://docs.djangoproject.com/en/5.0/ref/settings/#default-auto-field + +DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' diff --git a/config/urls.py b/config/urls.py new file mode 100644 index 0000000..a770eb4 --- /dev/null +++ b/config/urls.py @@ -0,0 +1,26 @@ +""" +URL configuration for config project. + +The `urlpatterns` list routes URLs to views. For more information please see: + https://docs.djangoproject.com/en/5.0/topics/http/urls/ +Examples: +Function views + 1. Add an import: from my_app import views + 2. Add a URL to urlpatterns: path('', views.home, name='home') +Class-based views + 1. Add an import: from other_app.views import Home + 2. Add a URL to urlpatterns: path('', Home.as_view(), name='home') +Including another URLconf + 1. Import the include() function: from django.urls import include, path + 2. Add a URL to urlpatterns: path('blog/', include('blog.urls')) +""" +from django.contrib import admin +from django.urls import path + +from GroupFive.views import AnalysisView + +urlpatterns = [ + path('admin/', admin.site.urls), + path('api/GroupFive/', AnalysisView.as_view(), name='GroupFive'), + path('api/GroupFive/', ) +] diff --git a/config/wsgi.py b/config/wsgi.py new file mode 100644 index 0000000..c0a9631 --- /dev/null +++ b/config/wsgi.py @@ -0,0 +1,16 @@ +""" +WSGI config for config project. + +It exposes the WSGI callable as a module-level variable named ``application``. 
+ +For more information on this file, see +https://docs.djangoproject.com/en/5.0/howto/deployment/wsgi/ +""" + +import os + +from django.core.wsgi import get_wsgi_application + +os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + +application = get_wsgi_application() diff --git a/manage.py b/manage.py new file mode 100644 index 0000000..8e7ac79 --- /dev/null +++ b/manage.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python +"""Django's command-line utility for administrative tasks.""" +import os +import sys + + +def main(): + """Run administrative tasks.""" + os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'config.settings') + try: + from django.core.management import execute_from_command_line + except ImportError as exc: + raise ImportError( + "Couldn't import Django. Are you sure it's installed and " + "available on your PYTHONPATH environment variable? Did you " + "forget to activate a virtual environment?" + ) from exc + execute_from_command_line(sys.argv) + + +if __name__ == '__main__': + main() From d74cf23214a4bb693bd096fff49f489f0374ae20 Mon Sep 17 00:00:00 2001 From: NathanEdwards2023 Date: Wed, 18 Feb 2026 18:29:36 -0600 Subject: [PATCH 14/18] Frontend UI --- front-end/index.html | 113 ++++++++++++++--- front-end/scripts/upload.js | 14 +++ front-end/styles/style.css | 245 ++++++++++++++++++++++++++++++++++++ 3 files changed, 357 insertions(+), 15 deletions(-) create mode 100644 front-end/scripts/upload.js diff --git a/front-end/index.html b/front-end/index.html index 3a3ade3..f5ee704 100644 --- a/front-end/index.html +++ b/front-end/index.html @@ -1,25 +1,108 @@ - - - - - Hello World! - - - + AutoPen Dashboard + + + - - - - - - - + + +
+
+

Penetration Testing Dashboard

+
System Status: Active
+
+ +
+
+

Total Scans

+

128

+
+
+

Critical Vulnerabilities

+

12

+
+
+

Medium Vulnerabilities

+

34

+
+
+

Low Vulnerabilities

+

56

+
+
+ + +
+
+

Upload Code for Analysis

+
+ + +
+ + +
+ +
+ + +
+ +
+ + +
+
+ + + +
+

Recent Scan Results

+ + + + + + + + + + + + + + + + + + + + + + + +
TargetDateRisk LevelStatus
example.com02/14/2026CriticalCompleted
test-server.net02/12/2026LowCompleted
+
+
+ + \ No newline at end of file diff --git a/front-end/scripts/upload.js b/front-end/scripts/upload.js new file mode 100644 index 0000000..4d33a0b --- /dev/null +++ b/front-end/scripts/upload.js @@ -0,0 +1,14 @@ +const tabButtons = document.querySelectorAll(".tab-btn"); +const tabContents = document.querySelectorAll(".tab-content"); + +tabButtons.forEach(button => { + button.addEventListener("click", () => { + // Remove active state + tabButtons.forEach(btn => btn.classList.remove("active")); + tabContents.forEach(tab => tab.classList.remove("active")); + + // Activate selected tab + button.classList.add("active"); + document.getElementById(button.dataset.tab).classList.add("active"); + }); +}); \ No newline at end of file diff --git a/front-end/styles/style.css b/front-end/styles/style.css index e69de29..8357d12 100644 --- a/front-end/styles/style.css +++ b/front-end/styles/style.css @@ -0,0 +1,245 @@ +/* ================= RESET ================= */ +* { + margin: 0; + padding: 0; + box-sizing: border-box; + font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; +} + +body { + background: #000000; + color: #e5e7eb; + display: flex; + min-height: 100vh; +} + +/* ================= SIDEBAR ================= */ +.sidebar { + width: 250px; + height: 100vh; + background: rgba(17, 24, 39, 0.8); + padding: 20px; + border-right: 1px solid rgba(168, 85, 247, 0.2); +} + +.sidebar h2 { + color: #e9d5ff; + margin-bottom: 40px; + text-align: center; +} + +.sidebar ul { + list-style: none; +} + +.sidebar ul li { + padding: 15px; + margin: 10px 0; + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + cursor: pointer; + transition: 0.25s; +} + +.sidebar ul li:hover { + background: rgba(17, 24, 39, 0.95); +} + +/* ================= MAIN ================= */ +.main { + flex: 1; + padding: 40px; +} + +/* ================= HEADER ================= */ +.header { + display: flex; + justify-content: space-between; + align-items: 
center; + margin-bottom: 40px; +} + +.header h1 { + color: #ffffff; +} + +.status { + background: rgba(17, 24, 39, 0.8); + padding: 10px 20px; + border-radius: 999px; + border: 1px solid rgba(168, 85, 247, 0.2); +} + +/* ================= UPLOAD SECTION ================= */ +.code-center { + margin-bottom: 50px; +} + +.code-box { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 12px; + padding: 30px; + width: 100%; + max-width: 900px; +} + +.code-box h2 { + color: #ffffff; + margin-bottom: 20px; +} + +/* ================= TABS ================= */ +.upload-tabs { + display: flex; + gap: 12px; + margin-bottom: 20px; +} + +.tab-btn { + background: transparent; + border: 1px solid rgba(168, 85, 247, 0.2); + color: #e9d5ff; + padding: 8px 22px; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.tab-btn:hover { + background: rgba(168, 85, 247, 0.15); +} + +.tab-btn.active { + background: rgba(168, 85, 247, 0.25); +} + +/* ================= TAB CONTENT ================= */ +.tab-content { + display: none; +} + +.tab-content.active { + display: block; +} + +/* ================= DROP ZONE ================= */ +.drop-zone { + display: block; + padding: 50px; + border: 2px dashed rgba(168, 85, 247, 0.8); + border-radius: 12px; + background: rgba(17, 24, 39, 0.6); + text-align: center; + cursor: pointer; + transition: 0.25s; + width: 80%; + max-width: 600px; + margin: 0 auto; +} + +.drop-zone:hover { + background: rgba(17, 24, 39, 0.9); +} + +.drop-zone p { + font-size: 18px; + margin-bottom: 6px; +} + +.drop-zone span { + font-size: 14px; + color: #c4b5fd; +} + +/* ================= TEXTAREA ================= */ +textarea { + width: 100%; + min-height: 220px; + background: rgba(17, 24, 39, 0.6); + color: #ffffff; + border: 1px solid rgba(168, 85, 247, 0.2); + border-radius: 8px; + padding: 15px; + resize: vertical; +} + +/* ================= BUTTON ================= */ +.scan-btn { + margin-top: 20px; + 
padding: 10px 30px; + background: rgba(168, 85, 247, 0.85); + color: white; + border: none; + border-radius: 999px; + cursor: pointer; + transition: 0.25s; +} + +.scan-btn:hover { + background: rgba(168, 85, 247, 1); +} + +/* ================= CARDS ================= */ +.cards { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(250px, 1fr)); + gap: 20px; + margin-bottom: 40px; +} + +.card { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; + transition: 0.25s; +} + +.card:hover { + background: rgba(17, 24, 39, 0.95); +} + +.card h3 { + color: #ffffff; + margin-bottom: 10px; +} + +.card p { + font-size: 28px; + font-weight: bold; +} + +/* ================= TABLE ================= */ +.recent-scans { + background: rgba(17, 24, 39, 0.8); + border: 1px solid rgba(168, 85, 247, 0.2); + padding: 20px; + border-radius: 12px; +} + +table { + width: 100%; + border-collapse: collapse; + margin-top: 15px; +} + +th, td { + padding: 12px; + text-align: left; +} + +th { + color: #e9d5ff; + border-bottom: 1px solid rgba(168, 85, 247, 0.2); +} + +tr:hover { + background: rgba(168, 85, 247, 0.05); +} + +/* ================= SEVERITY COLORS ================= */ +.critical { color: #fb7185; font-weight: bold; } +.medium { color: #facc15; font-weight: bold; } +.low { color: #34d399; font-weight: bold; } From 71e9bd44ce2b47036bdcfdb6a979627fe22e5d8f Mon Sep 17 00:00:00 2001 From: zhangtingen Date: Wed, 4 Mar 2026 01:46:05 -0600 Subject: [PATCH 15/18] create the prescan function with semgrep and gitleak --- front-end/scripts/main.js | 5 +- main.py | 2 + prescan.py | 123 ++++++++++++++++++++++++++++++++++++++ prescan_report.json | 13 ++++ requirements.txt | 2 + 5 files changed, 142 insertions(+), 3 deletions(-) create mode 100644 prescan.py create mode 100644 prescan_report.json diff --git a/front-end/scripts/main.js b/front-end/scripts/main.js index 38845f8..fc76eff 100644 --- 
a/front-end/scripts/main.js +++ b/front-end/scripts/main.js @@ -25,8 +25,7 @@ function askGPT() { document.querySelector('#output').innerText = "Loading..."; - eel.ask_api(prompt)(result => { + let newVar = eel.ask_api(prompt)(result => { document.querySelector('#output').innerText = result; - }); + });}} -} \ No newline at end of file diff --git a/main.py b/main.py index c97d798..1aef0c6 100644 --- a/main.py +++ b/main.py @@ -8,6 +8,8 @@ #sofia #jacob #tim +#Nathan +#Sid eel.init('front-end') try: diff --git a/prescan.py b/prescan.py new file mode 100644 index 0000000..086cdc5 --- /dev/null +++ b/prescan.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python3 +""" +Pre-scan 模組:對指定檔案或目錄執行 semgrep 與 gitleaks,輸出合併的 JSON 報告。 +使用方式: + python prescan.py [輸入路徑] [-o 輸出.json] + 不給路徑時預設掃描當前目錄。 +""" + +import argparse +import json +import os +import subprocess +import sys +from pathlib import Path + + +def run_semgrep(target_path: str) -> dict: + """對 target_path 執行 semgrep,回傳 JSON 結果。失敗或未安裝則回傳空結構。""" + path = Path(target_path).resolve() + if not path.exists(): + return {"tool": "semgrep", "error": f"path not found: {target_path}", "results": []} + try: + cmd = [ + sys.executable, "-m", "semgrep", "scan", + "--config", "auto", + "--json", + "--quiet", + str(path), + ] + out = subprocess.run( + cmd, + capture_output=True, + text=True, + timeout=300, + cwd=os.getcwd(), + ) + if out.returncode != 0 and not out.stdout.strip(): + return { + "tool": "semgrep", + "error": out.stderr.strip() or f"exit code {out.returncode}", + "results": [], + } + data = json.loads(out.stdout) if out.stdout.strip() else {} + return {"tool": "semgrep", "error": None, "results": data.get("results", data)} + except FileNotFoundError: + return {"tool": "semgrep", "error": "semgrep not installed (pip install semgrep)", "results": []} + except subprocess.TimeoutExpired: + return {"tool": "semgrep", "error": "timeout", "results": []} + except json.JSONDecodeError as e: + return {"tool": "semgrep", "error": str(e), 
def run_gitleaks(target_path: str) -> dict:
    """Run ``gitleaks detect`` against target_path and return its findings.

    Returns a dict: {"tool": "gitleaks", "error": <str|None>, "results": [...]}.
    A missing path, missing binary, timeout, or a hard gitleaks failure is
    reported through "error" instead of raising, so the caller can always
    merge this structure into the combined report.
    """
    path = Path(target_path).resolve()
    if not path.exists():
        return {"tool": "gitleaks", "error": f"path not found: {target_path}", "results": []}
    # gitleaks scans a directory; for a single file, scan its parent.
    source = str(path) if path.is_dir() else str(path.parent)
    try:
        cmd = [
            "gitleaks", "detect",
            "--source", source,
            "--no-git",
            "--report-format", "json",
            "--report-path", "-",  # emit the JSON report on stdout
        ]
        out = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=120,
        )
        # Exit code 1 means "leaks found" and stdout still carries JSON.
        # Anything above 1 is a genuine tool failure; the original silently
        # treated that as "no findings" — surface it instead.
        if out.returncode not in (0, 1):
            stderr = (out.stderr or "").strip()
            return {
                "tool": "gitleaks",
                "error": f"gitleaks failed (exit {out.returncode}): {stderr[:500]}",
                "results": [],
            }
        raw = out.stdout.strip()
        if not raw:
            return {"tool": "gitleaks", "error": None, "results": []}
        try:
            data = json.loads(raw)
            # Newer gitleaks emits a bare list; older builds wrap it.
            results = data if isinstance(data, list) else data.get("findings", data.get("results", []))
        except json.JSONDecodeError:
            results = []
        return {"tool": "gitleaks", "error": None, "results": results}
    except FileNotFoundError:
        return {"tool": "gitleaks", "error": "gitleaks not installed", "results": []}
    except subprocess.TimeoutExpired:
        return {"tool": "gitleaks", "error": "timeout", "results": []}


def main():
    """CLI entry point: run semgrep + gitleaks and write a merged JSON report."""
    parser = argparse.ArgumentParser(description="Pre-scan: semgrep + gitleaks -> JSON report")
    parser.add_argument(
        "input_path",
        nargs="?",
        default=".",
        help="File or directory path to scan (default: current directory)",
    )
    parser.add_argument(
        "-o", "--output",
        default="prescan_report.json",
        help="Output JSON file path (default: prescan_report.json)",
    )
    args = parser.parse_args()

    input_path = os.path.normpath(args.input_path)
    if not os.path.exists(input_path):
        print(f"Error: path not found {input_path}", file=sys.stderr)
        sys.exit(1)

    report = {
        "input_path": os.path.abspath(input_path),
        "semgrep": run_semgrep(input_path),
        "gitleaks": run_gitleaks(input_path),
    }

    out_path = args.output
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)

    print(f"Report written to: {out_path}")


if __name__ == "__main__":
    main()
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Login</title>
    <!-- NOTE(review): the original page used Django template tags
         ({% csrf_token %}, {% if error %}/{{ error }}) and posted the form to
         a server route. This front-end is served statically by Eel, so those
         tags were rendered literally in the browser and the POST had no
         handler. The page is now static; wire the submit handler to an
         eel.expose'd login() function on the Python side. -->
</head>
<body>
    <h2>Login</h2>

    <!-- Populated from JS once the backend reports a login failure. -->
    <p id="error" style="color: red; display: none;"></p>

    <form id="login-form">
        <label for="username">Username:</label>
        <input type="text" id="username" name="username" required>
        <br>
        <label for="password">Password:</label>
        <input type="password" id="password" name="password" required>
        <br>
        <button type="submit">Login</button>
    </form>

    <p>Don't have an account?</p>
    <a href="register.html">Register</a>

    <script type="text/javascript" src="/eel.js"></script>
    <script>
        // TODO(backend): replace the body with a real eel.expose'd login call,
        // e.g. eel.login(username, password)(onLoginResult);
        document.getElementById("login-form").addEventListener("submit", function (e) {
            e.preventDefault();
        });
    </script>
</body>
</html>

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Register</title>
    <!-- NOTE(review): the original page contained Django-only template syntax
         ({% csrf_token %}) and a server-side form POST. The app serves this
         file statically through Eel, so the tag appeared literally and the
         POST had no handler. Static version below; connect the submit handler
         to an eel.expose'd register() function on the Python side. -->
</head>
<body>
    <h2>Create Account</h2>

    <form id="register-form">
        <label for="username">Username:</label>
        <input type="text" id="username" name="username" required>
        <br>
        <label for="password">Password:</label>
        <input type="password" id="password" name="password" required>
        <br>
        <button type="submit">Register</button>
    </form>

    <a href="login.html">Back to Login</a>

    <script type="text/javascript" src="/eel.js"></script>
    <script>
        // TODO(backend): call an eel.expose'd register(username, password) here.
        document.getElementById("register-form").addEventListener("submit", function (e) {
            e.preventDefault();
        });
    </script>
</body>
</html>
"snippet": "heuristic" + } + ], + "analysis_limitations": [ + "Heuristic only; line may be 0 when matched on whole file." + ] + }, + { + "finding_id": "F-002", + "issue_type": "sql_execution_review", + "title": "SQL Execution Present — Review for Injection / Unsafe Queries", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 7, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT NOW();\"))" + }, + { + "line": 13, + "category": "sql_execution", + "snippet": "#conn.execute(text(\"INSERT INTO users(email, password_hash) VALUES ('test2@example.com', '1A2B3C');\"))" + }, + { + "line": 14, + "category": "sql_execution", + "snippet": "result = conn.execute(text(\"SELECT * FROM users;\"))" + } + ] + }, + { + "finding_id": "F-003", + "issue_type": "potential_sensitive_data_exposure_via_debug_output", + "title": "Potential Sensitive Data Exposure via Debug Output", + "severity": "medium", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 8, + "category": "debug_output", + "snippet": "print(\"Connected! Server time:\", result.fetchone()[0])" + }, + { + "line": 10, + "category": "debug_output", + "snippet": "print(\"Connection failed:\", e)" + }, + { + "line": 15, + "category": "debug_output", + "snippet": "print(result.fetchall())" + } + ], + "analysis_limitations": [ + "Cannot determine if printed data contains sensitive fields without data flow analysis." 
+ ] + }, + { + "finding_id": "F-004", + "issue_type": "broad_exception_handling", + "title": "Broad Exception Handling", + "severity": "low", + "confidence": "medium", + "finding_status": "review_needed", + "source_file": "testquery.py", + "evidence": [ + { + "line": 9, + "category": "broad_except", + "snippet": "except Exception as e:" + } + ] + } + ], + "meta": { + "finding_count": 4, + "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed." + } +} \ No newline at end of file diff --git a/preprocess/__init__.py b/preprocess/__init__.py new file mode 100644 index 0000000..3716dcb --- /dev/null +++ b/preprocess/__init__.py @@ -0,0 +1,26 @@ +""" +Backend preprocessing pipeline for secure code review. +Produces structured JSON (project/file/chunk) without frontend. +""" + +from preprocess.pipeline import run_file, run_snippet, PIPELINE_VERSION +from preprocess.normalized_findings import ( + normalize_preprocess_output, + normalize_file_findings, + run_file_with_findings, + export_ai_payload, + run_file_ai_payload, + slim_finding_for_ai, +) + +__all__ = [ + "run_file", + "run_snippet", + "PIPELINE_VERSION", + "normalize_preprocess_output", + "normalize_file_findings", + "run_file_with_findings", + "export_ai_payload", + "run_file_ai_payload", + "slim_finding_for_ai", +] diff --git a/preprocess/chunking.py b/preprocess/chunking.py new file mode 100644 index 0000000..9fd74b7 --- /dev/null +++ b/preprocess/chunking.py @@ -0,0 +1,119 @@ +""" +Build chunks: whole file, or per function/class, or line-based fallback. 
# --- preprocess/chunking.py ---------------------------------------------
# Build chunks: whole file, or per function/class, or line-based fallback.

from typing import Any, Dict, List

# Files at or under this many lines are emitted as a single whole-file chunk.
MAX_LINES_WHOLE_CHUNK = 200
# Size/overlap (in lines) of the generic line-window fallback.
LINE_CHUNK_SIZE = 120
LINE_CHUNK_OVERLAP = 10


def lines_to_content(lines: List[str], start: int, end: int) -> str:
    """Join lines start..end (1-based, inclusive); out-of-range bounds are clamped."""
    if start < 1:
        start = 1
    if end > len(lines):
        end = len(lines)
    return "\n".join(lines[start - 1 : end])


def build_chunks(
    file_id: str,
    content: str,
    language: str,
    structure: Dict[str, Any],
) -> List[Dict[str, Any]]:
    """
    Split file content into chunk dicts (file_id, start_line, end_line, type,
    symbol, content). chunk_id / content_hash are filled in by the caller.

    Strategy:
      * parsed Python larger than MAX_LINES_WHOLE_CHUNK -> one chunk per
        function/class span;
      * small files -> one whole-file chunk;
      * everything else -> overlapping line windows.
    """
    lines = content.splitlines()
    n = len(lines)
    chunks: List[Dict[str, Any]] = []

    if language == "python" and structure.get("parse_ok"):
        # Prefer semantic spans (functions/classes) for large parsed files.
        spans = []
        for func in structure.get("functions", []):
            spans.append(("function", func["name"], func["line"], func.get("end_line", func["line"])))
        for cls in structure.get("classes", []):
            spans.append(("class", cls["name"], cls["line"], cls.get("end_line", cls["line"])))
        spans.sort(key=lambda s: s[2])
        if spans and n > MAX_LINES_WHOLE_CHUNK:
            for typ, sym, start, end in spans:
                if end < start:  # defensive: malformed span from the parser
                    end = start
                chunk_content = lines_to_content(lines, start, end)
                if not chunk_content.strip():
                    continue
                chunks.append({
                    "file_id": file_id,
                    "start_line": start,
                    "end_line": end,
                    "type": typ,
                    "symbol": sym,
                    "content": chunk_content,
                })
        # Fix: the original also built a `covered` set of every line number in
        # every span plus a dead `if not chunks: pass` branch; neither was ever
        # read, so both are removed (behavior unchanged).
        if not chunks and n <= MAX_LINES_WHOLE_CHUNK:
            chunks.append({
                "file_id": file_id,
                "start_line": 1,
                "end_line": n,
                "type": "file",
                "symbol": None,
                "content": content,
            })
        elif not chunks:
            chunks.extend(_line_chunks(file_id, lines))
    else:
        if n <= MAX_LINES_WHOLE_CHUNK:
            chunks.append({
                "file_id": file_id,
                "start_line": 1,
                "end_line": n,
                "type": "file",
                "symbol": None,
                "content": content,
            })
        else:
            chunks.extend(_line_chunks(file_id, lines))

    # Last resort: never return nothing for non-empty content.
    if not chunks and content.strip():
        chunks.append({
            "file_id": file_id,
            "start_line": 1,
            "end_line": max(n, 1),
            "type": "file",
            "symbol": None,
            "content": content,
        })
    return chunks


def _line_chunks(file_id: str, lines: List[str]) -> List[Dict[str, Any]]:
    """Overlapping fixed-size line windows (LINE_CHUNK_SIZE / LINE_CHUNK_OVERLAP)."""
    out = []
    n = len(lines)
    i = 0
    while i < n:
        start = i + 1
        end = min(i + LINE_CHUNK_SIZE, n)
        out.append({
            "file_id": file_id,
            "start_line": start,
            "end_line": end,
            "type": "lines",
            "symbol": None,
            "content": "\n".join(lines[i:end]),
        })
        # Step back by the overlap, but always make forward progress.
        i = end - LINE_CHUNK_OVERLAP if end - LINE_CHUNK_OVERLAP > i else end
        if i >= n:
            break
    return out


# --- preprocess/filters.py ----------------------------------------------
# Skip binary/large/minified paths and irrelevant directories.

from pathlib import Path
from typing import Optional, Tuple

SKIP_DIR_NAMES = {
    "node_modules",
    "venv",
    ".venv",
    ".git",
    "dist",
    "build",
    "__pycache__",
    ".next",
    "target",
    ".idea",
    ".vscode",
}

BINARY_EXTENSIONS = {
    ".png", ".jpg", ".jpeg", ".gif", ".webp", ".ico", ".bmp",
    ".pdf", ".zip", ".tar", ".gz", ".7z", ".rar",
    ".so", ".dll", ".dylib", ".exe", ".bin",
    ".pyc", ".pyo", ".class", ".o", ".a",
    ".woff", ".woff2", ".ttf", ".eot",
    ".mp3", ".mp4", ".webm", ".avi",
}

MINIFIED_NAME_SUFFIXES = (".min.js", ".min.css")
GENERATED_NAME_PARTS = ("-generated", "_generated", ".generated.")


def should_skip_path(path: Path, max_bytes: int = 512 * 1024) -> Tuple[bool, Optional[str]]:
    """
    Decide whether `path` should be excluded from preprocessing.
    Returns (skip, reason); reason is None when the path is accepted.
    """
    if not path.exists():
        return True, "not_found"
    if path.is_dir():
        # Directories are only rejected by name; recursion is the caller's job.
        if path.name in SKIP_DIR_NAMES:
            return True, f"skip_dir:{path.name}"
        return False, None
    suffix = path.suffix.lower()
    if suffix in BINARY_EXTENSIONS:
        return True, f"binary_ext:{suffix}"
    try:
        size = path.stat().st_size
    except OSError:
        return True, "stat_failed"
    if size > max_bytes:
        return True, f"too_large:{size}"
    name = path.name.lower()
    if name.endswith(MINIFIED_NAME_SUFFIXES):  # str.endswith accepts a tuple
        return True, "minified_name"
    if any(part in name for part in GENERATED_NAME_PARTS):
        return True, "generated_name"
    return False, None


def path_has_skip_segment(path: Path) -> bool:
    """True if any component of `path` is a known skip directory."""
    return any(part in SKIP_DIR_NAMES for part in path.parts)


# --- preprocess/hashing.py ----------------------------------------------
import hashlib
import uuid


def sha256_text(s: str) -> str:
    """Hex SHA-256 digest of a UTF-8 encoded string."""
    return hashlib.sha256(s.encode("utf-8")).hexdigest()


def sha256_bytes(b: bytes) -> str:
    """Hex SHA-256 digest of raw bytes."""
    return hashlib.sha256(b).hexdigest()


def new_uuid() -> str:
    """Random UUID4 as a string."""
    return str(uuid.uuid4())
+ """ + raw = path.read_bytes() + if b"\x00" in raw[:8192] and raw[:8192].count(b"\x00") > 2: + return None, None, "likely_binary_null_bytes" + for enc in ENCODING_FALLBACKS: + try: + text = raw.decode(enc) + return text, enc, None + except UnicodeDecodeError: + continue + return None, None, "decode_failed_all_encodings" + + +def is_probably_text(s: str, sample_lines: int = 50) -> bool: + """Heuristic: too many non-printable chars => skip.""" + sample = "\n".join(s.splitlines()[:sample_lines]) + if not sample.strip(): + return True + printable = sum(1 for c in sample if c.isprintable() or c in "\n\r\t") + ratio = printable / max(len(sample), 1) + return ratio >= 0.85 diff --git a/preprocess/language.py b/preprocess/language.py new file mode 100644 index 0000000..0610b53 --- /dev/null +++ b/preprocess/language.py @@ -0,0 +1,105 @@ +""" +Language detection: extension first, then shebang/keywords/patterns. +""" + +import re +from pathlib import Path +from typing import Dict, Optional + +EXT_TO_LANGUAGE = { + ".py": "python", + ".js": "javascript", + ".mjs": "javascript", + ".ts": "typescript", + ".tsx": "typescript", + ".jsx": "javascript", + ".java": "java", + ".go": "go", + ".rb": "ruby", + ".php": "php", + ".cs": "csharp", + ".rs": "rust", + ".c": "c", + ".cpp": "cpp", + ".h": "c", + ".sql": "sql", + ".sh": "shell", + ".bash": "shell", + ".yaml": "yaml", + ".yml": "yaml", + ".json": "json", + ".html": "html", + ".css": "css", +} + +SHEBANG_PATTERN = re.compile(r"^#!\s*/usr/bin/env\s+(\w+)|^#!\s*/.*\b(python|node|ruby)\b", re.MULTILINE) + +KEYWORD_HINTS = [ + (r"\bdef\s+\w+\s*\(", "python"), + (r"\bimport\s+\w+", "python"), + (r"\bfrom\s+\w+\s+import\b", "python"), + (r"\bfunction\s+\w+\s*\(", "javascript"), + (r"\bconst\s+\w+\s*=\s*\(", "javascript"), + (r"\brequire\s*\(", "javascript"), + (r"\bpublic\s+class\s+\w+", "java"), + (r"\bpackage\s+main\b", "go"), + (r"\bfn\s+main\s*\(", "rust"), +] + + +def detect_language(path: Optional[Path], content: str, hint: 
Optional[str] = None) -> Dict: + """ + Returns { "language", "confidence": high|medium|low|unknown, "reason": str }. + """ + reasons = [] + lang_from_ext = None + if path and path.suffix: + lang_from_ext = EXT_TO_LANGUAGE.get(path.suffix.lower()) + if lang_from_ext: + reasons.append(f"extension:{path.suffix}") + + if hint: + h = hint.strip().lower() + if h in ("py", "python"): + lang_from_ext = lang_from_ext or "python" + reasons.append("hint:python") + elif h in ("js", "javascript"): + lang_from_ext = lang_from_ext or "javascript" + reasons.append("hint:javascript") + + lang_from_content = None + first_lines = "\n".join(content.splitlines()[:30]) + m = SHEBANG_PATTERN.search(first_lines) + if m: + g = (m.group(1) or m.group(2) or "").lower() + if "python" in g: + lang_from_content = "python" + elif "node" in g: + lang_from_content = "javascript" + elif "ruby" in g: + lang_from_content = "ruby" + if lang_from_content: + reasons.append("shebang") + + if not lang_from_content: + for pattern, lang in KEYWORD_HINTS: + if re.search(pattern, first_lines): + lang_from_content = lang + reasons.append(f"keyword:{pattern[:20]}") + break + + if lang_from_ext and lang_from_content: + if lang_from_ext == lang_from_content: + return {"language": lang_from_ext, "confidence": "high", "reason": ";".join(reasons)} + return { + "language": lang_from_ext, + "confidence": "medium", + "reason": f"extension_vs_content_conflict;{';'.join(reasons)}", + } + if lang_from_ext: + return {"language": lang_from_ext, "confidence": "medium", "reason": ";".join(reasons)} + if lang_from_content: + return {"language": lang_from_content, "confidence": "medium", "reason": ";".join(reasons)} + if path and path.suffix: + return {"language": "unknown", "confidence": "low", "reason": f"unknown_ext:{path.suffix}"} + return {"language": "unknown", "confidence": "unknown", "reason": "no_extension_no_keywords"} diff --git a/preprocess/normalized_findings.py b/preprocess/normalized_findings.py new file mode 
"""
Normalize preprocess signals into structured findings for AI / report stage.
No human layer: finding_status is review_needed; confidence reflects heuristic only.
"""

from typing import Any, Dict, List, Optional

# Signal bucket key (as in file_record["signals"]) -> default issue template
# category inside each item is the regex category e.g. broad_except, debug_output
SIGNAL_BUCKET_META = {
    "error_handling": {
        "issue_type": "broad_exception_handling",
        "title": "Broad Exception Handling",
        "affected_component": ["backend", "error_handling"],
        "confidence": "medium",
        "severity": "low",
        "remediation_keywords": [
            "catch specific exceptions",
            "structured logging",
            "safer exception handling",
        ],
        "possible_impact": [
            "reduced error observability",
            "generic failure handling may hide root causes",
        ],
    },
    "debug_output": {
        "issue_type": "potential_sensitive_data_exposure_via_debug_output",
        "title": "Potential Sensitive Data Exposure via Debug Output",
        "affected_component": ["backend", "logging"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "remove debug prints",
            "sanitize logs",
            "avoid dumping full query results",
        ],
        "possible_impact": [
            "internal data may appear in console or logs",
            "debug output could leak in production",
        ],
        "analysis_limitations": [
            "Cannot determine if printed data contains sensitive fields without data flow analysis.",
        ],
    },
    "sql_execution": {
        "issue_type": "sql_execution_review",
        "title": "SQL Execution Present — Review for Injection / Unsafe Queries",
        "affected_component": ["backend", "database"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "parameterized queries",
            "prepared statements",
            "avoid string concatenation in SQL",
        ],
        "possible_impact": [
            "SQL injection if input is concatenated into queries",
            "unsafe query patterns",
        ],
    },
    "command_execution": {
        "issue_type": "command_execution_surface",
        "title": "Command Execution or Dynamic Evaluation",
        "affected_component": ["backend", "process"],
        "confidence": "high",
        "severity": "high",
        "remediation_keywords": [
            "avoid eval/exec",
            "sanitize subprocess arguments",
            "use allowlists for shell commands",
        ],
        "possible_impact": [
            "command injection",
            "arbitrary code execution",
        ],
    },
    "file_access": {
        "issue_type": "file_io_review",
        "title": "File Access — Review Path Handling",
        "affected_component": ["backend", "filesystem"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "validate paths",
            "avoid path traversal",
        ],
        "possible_impact": [
            "path traversal if paths are user-controlled",
        ],
    },
    "user_input_sources": {
        "issue_type": "user_input_flow",
        "title": "User-Controlled Input Source",
        "affected_component": ["backend", "input"],
        "confidence": "medium",
        "severity": "medium",
        "remediation_keywords": [
            "validate and sanitize input",
            "use safe APIs",
        ],
        "possible_impact": [
            "injection or logic flaws if input reaches sensitive sinks",
        ],
    },
    "possible_hardcoded_secrets": {
        "issue_type": "possible_hardcoded_secret",
        "title": "Possible Hardcoded Secret",
        "affected_component": ["backend", "secrets"],
        "confidence": "low",
        "severity": "high",
        "remediation_keywords": [
            "use environment variables or secret manager",
            "rotate credentials",
        ],
        "possible_impact": [
            "credential leak if committed or logged",
        ],
        "analysis_limitations": [
            "May be test data or placeholders; verify context.",
        ],
    },
    "auth_related_keywords": {
        "issue_type": "auth_surface_keyword",
        "title": "Auth-Related Keyword Present",
        "affected_component": ["backend", "auth"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "review auth flow",
            "session handling",
        ],
        "possible_impact": [
            "auth logic may need manual review",
        ],
    },
    "database_access": {
        "issue_type": "database_access_heuristic",
        "title": "Database Access Pattern Detected",
        "affected_component": ["backend", "database"],
        "confidence": "low",
        "severity": "low",
        "remediation_keywords": [
            "least privilege",
            "connection pooling security",
        ],
        "possible_impact": [
            "review how connections and queries are used",
        ],
        "analysis_limitations": [
            "Heuristic only; line may be 0 when matched on whole file.",
        ],
    },
    "crypto_usage": {
        "issue_type": "crypto_usage_review",
        "title": "Cryptographic API Usage",
        "affected_component": ["backend", "crypto"],
        "confidence": "low",
        "severity": "medium",
        "remediation_keywords": [
            "use vetted libraries",
            "avoid weak algorithms",
        ],
        "possible_impact": [
            "misuse may weaken security",
        ],
    },
}


def _evidence_from_signal_item(signal_key: str, item: Dict[str, Any]) -> Dict[str, Any]:
    """One evidence entry from a raw signal item (text clamped to 500 chars).

    Fix: the original also took an unused `index` parameter; dropped (the
    only caller is normalize_file_findings below).
    """
    line = item.get("line", 0)
    category = item.get("category", signal_key)
    text = item.get("match", "")
    if len(text) > 500:
        text = text[:500] + "..."
    return {
        "line": line,
        "signal": signal_key,
        "category": category,
        "text": text,
    }


def _refs_for_bucket(signal_key: str, indices: List[int]) -> List[str]:
    """Back-references into file_record["signals"][signal_key] for each index."""
    return [f"signals.{signal_key}[{i}]" for i in indices]


def normalize_file_findings(
    file_record: Dict[str, Any],
    file_index: int = 0,
    starting_f_index: int = 1,
) -> List[Dict[str, Any]]:
    """
    Build normalized_findings from a single preprocess file_record.
    One finding per signal bucket that has entries (evidence = all items in bucket).
    """
    signals = file_record.get("signals") or {}
    path = file_record.get("path", f"file_{file_index}")
    findings: List[Dict[str, Any]] = []
    f_num = starting_f_index

    for signal_key, items in signals.items():
        if not items:
            continue
        meta = SIGNAL_BUCKET_META.get(signal_key)
        if not meta:
            # Unknown bucket: generic finding
            meta = {
                "issue_type": f"heuristic_{signal_key}",
                "title": f"Signal: {signal_key}",
                "affected_component": ["backend"],
                "confidence": "low",
                "severity": "low",
                "remediation_keywords": ["manual review"],
                "possible_impact": ["pattern matched; context unknown"],
            }
        # Collect evidence; non-dict items are skipped but indices are kept
        # aligned so the refs point back at the original signal entries.
        evidence = []
        valid_indices: List[int] = []
        for i, item in enumerate(items):
            if not isinstance(item, dict):
                continue
            evidence.append(_evidence_from_signal_item(signal_key, item))
            valid_indices.append(i)

        if not evidence:
            continue

        # Fix: _refs_for_bucket existed but was never called — the refs were
        # rebuilt inline with the identical f-string. Use the helper.
        refs = _refs_for_bucket(signal_key, valid_indices)

        finding = {
            "finding_id": f"F-{f_num:03d}",
            "issue_type": meta["issue_type"],
            "title": meta["title"],
            "affected_component": list(meta["affected_component"]),
            "confidence": meta["confidence"],
            "severity": meta["severity"],
            "finding_status": "review_needed",
            "verification_method": "automated_heuristic",
            "source_file": path,
            "file_id": file_record.get("file_id"),
            "possible_impact": list(meta.get("possible_impact", [])),
            "evidence": evidence,
            "remediation_keywords": list(meta.get("remediation_keywords", [])),
            "source_signal_refs": refs,
        }
        if meta.get("analysis_limitations"):
            finding["analysis_limitations"] = list(meta["analysis_limitations"])
        findings.append(finding)
        f_num += 1

    return findings


def normalize_preprocess_output(preprocess_output: Dict[str, Any]) -> Dict[str, Any]:
    """
    Attach normalized_findings to full preprocess pipeline output.
    Finding ids are numbered continuously across all files.
    """
    all_findings: List[Dict[str, Any]] = []
    f_num = 1
    for idx, file_record in enumerate(preprocess_output.get("files") or []):
        batch = normalize_file_findings(file_record, file_index=idx, starting_f_index=f_num)
        all_findings.extend(batch)
        f_num += len(batch)

    return {
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "project_id": preprocess_output.get("project_id"),
        "created_at": preprocess_output.get("created_at"),
        "input_type": preprocess_output.get("input_type"),
        "input_path": preprocess_output.get("input_path"),
        "normalized_findings": all_findings,
        "normalized_findings_meta": {
            "count": len(all_findings),
            "generator": "preprocess.normalized_findings",
            "note": "All findings are heuristic; finding_status is review_needed unless overridden downstream.",
        },
    }


def slim_finding_for_ai(
    finding: Dict[str, Any],
    max_evidence_snippet: int = 120,
    include_remediation: bool = False,
) -> Dict[str, Any]:
    """
    Shrink one finding for LLM context: drop long text, optional remediation lists.
    """
    out = {
        "finding_id": finding.get("finding_id"),
        "issue_type": finding.get("issue_type"),
        "title": finding.get("title"),
        "severity": finding.get("severity"),
        "confidence": finding.get("confidence"),
        "finding_status": finding.get("finding_status"),
        "source_file": finding.get("source_file"),
    }
    slim_evidence = []
    for ev in finding.get("evidence") or []:
        text = ev.get("text") or ""
        if len(text) > max_evidence_snippet:
            text = text[:max_evidence_snippet].rstrip() + "..."
        slim_evidence.append({
            "line": ev.get("line"),
            "category": ev.get("category"),
            "snippet": text if text else None,
        })
    out["evidence"] = slim_evidence
    if include_remediation and finding.get("remediation_keywords"):
        out["remediation_keywords"] = finding.get("remediation_keywords")
    if finding.get("analysis_limitations"):
        # Keep one line each to save tokens
        out["analysis_limitations"] = [
            (s[:150] + "...") if len(s) > 150 else s
            for s in finding.get("analysis_limitations", [])[:2]
        ]
    return out


def export_ai_payload(
    preprocess_output: Dict[str, Any],
    max_evidence_snippet: int = 120,
) -> Dict[str, Any]:
    """
    Compact payload for AI only — no chunk content, no raw signals.
    Preprocess full JSON stays separate (long); this is short for token budget.
    """
    normalized = normalize_preprocess_output(preprocess_output)
    slim_findings = [
        slim_finding_for_ai(f, max_evidence_snippet=max_evidence_snippet)
        for f in normalized.get("normalized_findings") or []
    ]
    return {
        "schema": "ai_payload_v1",
        "project_id": preprocess_output.get("project_id"),
        "input_path": preprocess_output.get("input_path"),
        "input_type": preprocess_output.get("input_type"),
        "pipeline_version": preprocess_output.get("pipeline_version"),
        "normalized_findings": slim_findings,
        "meta": {
            "finding_count": len(slim_findings),
            "note": "Full source lives in preprocess output only; fetch by file_id/chunk_id if needed.",
        },
    }


def run_file_ai_payload(path: str, max_evidence_snippet: int = 120) -> Dict[str, Any]:
    """Preprocess then return only slim AI payload (no chunks, no duplicate signals)."""
    # Local import avoids a circular dependency with preprocess.pipeline.
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            "schema": "ai_payload_v1",
            "project_id": pre.get("project_id"),
            "input_path": pre.get("input_path"),
            "normalized_findings": [],
            "meta": {"finding_count": 0, "reason": "no files processed"},
        }
    return export_ai_payload(pre, max_evidence_snippet=max_evidence_snippet)


def run_file_with_findings(path: str) -> Dict[str, Any]:
    """Convenience: preprocess file then normalize (single import for CLI)."""
    from preprocess.pipeline import run_file

    pre = run_file(path)
    if not pre.get("files"):
        return {
            **pre,
            "normalized_findings": [],
            "normalized_findings_meta": {"count": 0, "reason": "no files processed"},
        }
    normalized = normalize_preprocess_output(pre)
    # Merge: keep full preprocess payload + normalized_findings at top level for AI
    out = dict(pre)
    out["normalized_findings"] = normalized["normalized_findings"]
    out["normalized_findings_meta"] = normalized["normalized_findings_meta"]
    return out
"""
Extract imports, functions, classes from Python source via ast.
On failure returns empty structure for fallback chunking.
"""

import ast
from typing import Any, Dict, List, Optional


def parse_python_structure(source: str) -> Dict[str, Any]:
    """Walk the AST of `source` and summarize imports, defs and call sites.

    Returns a dict: parse_ok (bool), imports / functions / classes (lists of
    dicts with 1-based line / end_line), calls (dotted call names only).
    Any unparsable source yields parse_ok=False with empty lists.
    """
    out: Dict[str, Any] = {
        "parse_ok": False,
        "imports": [],
        "functions": [],
        "classes": [],
        "calls": [],  # simplified: names only, from ast.Call if possible
    }
    try:
        tree = ast.parse(source)
    except (SyntaxError, ValueError):
        # Fix: ast.parse also raises ValueError (e.g. NUL bytes in source);
        # the original only caught SyntaxError and crashed on such input,
        # breaking the documented "empty structure on failure" contract.
        return out
    out["parse_ok"] = True
    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            for alias in node.names:
                out["imports"].append({
                    "line": node.lineno,
                    "end_line": getattr(node, "end_lineno", node.lineno),
                    "name": alias.name,
                    "alias": alias.asname,
                })
        elif isinstance(node, ast.ImportFrom):
            mod = node.module or ""
            for alias in node.names:
                out["imports"].append({
                    "line": node.lineno,
                    "end_line": getattr(node, "end_lineno", node.lineno),
                    # Qualify with the module ("os.path" for "from os import path").
                    "name": f"{mod}.{alias.name}" if mod else alias.name,
                    "alias": alias.asname,
                })
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            out["functions"].append({
                "name": node.name,
                "line": node.lineno,
                "end_line": getattr(node, "end_lineno", node.lineno),
            })
        elif isinstance(node, ast.ClassDef):
            out["classes"].append({
                "name": node.name,
                "line": node.lineno,
                "end_line": getattr(node, "end_lineno", node.lineno),
            })
        elif isinstance(node, ast.Call):
            name = _call_name(node)
            if name:
                out["calls"].append({"line": node.lineno, "name": name})
    return out


def _call_name(node: ast.Call) -> Optional[str]:
    """Dotted name of a call target (e.g. "os.path.join"); None if dynamic.

    For an attribute chain whose base is not a plain Name (e.g. "foo().bar()")
    only the attribute parts are joined — same as the original behavior.
    """
    if isinstance(node.func, ast.Name):
        return node.func.id
    if isinstance(node.func, ast.Attribute):
        parts: List[str] = []
        cur = node.func
        while isinstance(cur, ast.Attribute):
            parts.append(cur.attr)
            cur = cur.value
        if isinstance(cur, ast.Name):
            parts.append(cur.id)
        return ".".join(reversed(parts))
    return None
"""
Orchestrates preprocessing: file or snippet -> structured JSON.

Public entry points:
    run_file(path, ...)       -> envelope dict for one on-disk file
    run_snippet(content, ...) -> envelope dict for an in-memory snippet
    run_file_to_json(path)    -> run_file plus JSON serialization
"""

import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

from preprocess.chunking import build_chunks
from preprocess.filters import should_skip_path, path_has_skip_segment
from preprocess.hashing import sha256_text
from preprocess.io_read import read_text_with_fallback, is_probably_text
from preprocess.language import detect_language
from preprocess.parse_python import parse_python_structure
from preprocess.signals import extract_signals, risk_hints_from_signals

PIPELINE_VERSION = "1.0.0"


def _file_id(project_id: str, rel_path: str, content: str) -> str:
    """Stable id for a file: hash of (project, relative path, content hash)."""
    return sha256_text(f"{project_id}:{rel_path}:{sha256_text(content)}")


def _chunk_id(file_id: str, start: int, end: int, content: str) -> str:
    """Stable id for a chunk: hash of (file id, line span, content hash)."""
    return sha256_text(f"{file_id}:{start}:{end}:{sha256_text(content)}")


def _envelope(
    project_id: str,
    input_type: str,
    input_path: str,
    files: List[Dict[str, Any]],
    files_skipped: List[Dict[str, Any]],
    chunks: List[Dict[str, Any]],
) -> Dict[str, Any]:
    """Top-level result shape shared by every pipeline outcome."""
    return {
        "pipeline_version": PIPELINE_VERSION,
        "project_id": project_id,
        "created_at": datetime.now(timezone.utc).isoformat(),
        "input_type": input_type,
        "input_path": input_path,
        "files": files,
        "files_skipped": files_skipped,
        "chunks": chunks,
    }


def _skipped(project_id: str, path: Path, reason: str) -> Dict[str, Any]:
    """Envelope for a file rejected before parsing; reason goes to files_skipped."""
    return _envelope(
        project_id, "file", str(path),
        files=[],
        files_skipped=[{"path": str(path), "reason": reason}],
        chunks=[],
    )


def _chunk_records(file_id: str, raw_chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """Attach ids, content hashes, per-chunk signals and risk hints to raw chunks."""
    records: List[Dict[str, Any]] = []
    for ch in raw_chunks:
        sig = extract_signals(ch["content"])
        records.append({
            "chunk_id": _chunk_id(file_id, ch["start_line"], ch["end_line"], ch["content"]),
            "file_id": file_id,
            "start_line": ch["start_line"],
            "end_line": ch["end_line"],
            "type": ch["type"],
            "symbol": ch["symbol"],
            "content": ch["content"],
            "content_hash": sha256_text(ch["content"]),
            # Drop empty signal categories to keep the JSON compact.
            "signals": {k: v for k, v in sig.items() if v},
            "risk_hints": risk_hints_from_signals(sig),
        })
    return records


def _file_record(
    file_id: str,
    rel_path: str,
    text: str,
    lang_info: Dict[str, Any],
    structure: Dict[str, Any],
    signals_full: Dict[str, Any],
    encoding: str,
    byte_size: int,
) -> Dict[str, Any]:
    """Per-file metadata record: language, structure summary, whole-file signals."""
    return {
        "file_id": file_id,
        "path": rel_path,
        "language": lang_info["language"],
        "language_confidence": lang_info["confidence"],
        "language_reason": lang_info["reason"],
        "encoding_used": encoding,
        "line_count": len(text.splitlines()),
        "byte_size": byte_size,
        "content_hash": sha256_text(text),
        "parse_ok": structure.get("parse_ok", False),
        "imports": structure.get("imports", []),
        "functions": structure.get("functions", []),
        "classes": structure.get("classes", []),
        "calls_sample": structure.get("calls", [])[:50],  # cap to keep output bounded
        "signals": {k: v for k, v in signals_full.items() if v},
        "risk_hints": risk_hints_from_signals(signals_full),
    }


def run_file(
    path: Union[str, Path],
    language_hint: Optional[str] = None,
    max_file_bytes: int = 512 * 1024,
) -> Dict[str, Any]:
    """Preprocess one on-disk file into the structured-JSON envelope.

    Files that are too large, path-filtered, unreadable, or binary-looking are
    reported via ``files_skipped`` instead of raising.

    Args:
        path: File to preprocess; resolved to an absolute path.
        language_hint: Optional override forwarded to language detection.
        max_file_bytes: Size cutoff forwarded to should_skip_path.

    Returns:
        Envelope dict (see _envelope) with at most one file record.
    """
    path = Path(path).resolve()
    project_id = sha256_text(str(path))

    # Cheap rejections first, in the same order as before: size/type filter,
    # then skip-directory filter, then read, then binary heuristic.
    skip, reason = should_skip_path(path, max_bytes=max_file_bytes)
    if skip:
        return _skipped(project_id, path, reason)
    if path_has_skip_segment(path):
        return _skipped(project_id, path, "path_in_skip_dir")

    text, encoding, err = read_text_with_fallback(path)
    if text is None or err:
        return _skipped(project_id, path, err or "read_failed")
    if not is_probably_text(text):
        return _skipped(project_id, path, "not_probably_text")

    rel_path = path.name
    lang_info = detect_language(path, text, language_hint)
    # Structural parsing is python-only; other languages get an empty summary.
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(text)

    file_id = _file_id(project_id, rel_path, text)
    chunks_out = _chunk_records(
        file_id, build_chunks(file_id, text, lang_info["language"], structure))
    signals_full = extract_signals(text)
    file_record = _file_record(
        file_id, rel_path, text, lang_info, structure, signals_full,
        encoding, path.stat().st_size)

    return _envelope(project_id, "file", str(path), [file_record], [], chunks_out)


def run_snippet(
    content: str,
    virtual_name: str = "snippet.py",
    language_hint: Optional[str] = None,
) -> Dict[str, Any]:
    """Treat an in-memory snippet as a single virtual file and preprocess it.

    Args:
        content: Source text of the snippet.
        virtual_name: Filename used for language detection and record paths.
        language_hint: Optional override forwarded to language detection.

    Returns:
        Envelope dict with exactly one file record and its chunks.
    """
    path = Path(virtual_name)
    # project_id is derived from a content prefix plus the virtual name, so
    # identical snippets map to the same project.
    project_id = sha256_text(content[:5000] + virtual_name)
    lang_info = detect_language(path, content, language_hint)
    structure: Dict[str, Any] = {}
    if lang_info["language"] == "python":
        structure = parse_python_structure(content)

    file_id = _file_id(project_id, virtual_name, content)
    chunks_out = _chunk_records(
        file_id, build_chunks(file_id, content, lang_info["language"], structure))
    signals_full = extract_signals(content)
    file_record = _file_record(
        file_id, virtual_name, content, lang_info, structure, signals_full,
        "utf-8", len(content.encode("utf-8")))

    return _envelope(project_id, "snippet", virtual_name, [file_record], [], chunks_out)


def run_file_to_json(path: Union[str, Path], out_path: Optional[str] = None) -> str:
    """Run run_file and return pretty JSON; optionally also write it to out_path."""
    data = run_file(path)
    s = json.dumps(data, ensure_ascii=False, indent=2)
    if out_path:
        Path(out_path).write_text(s, encoding="utf-8")
    return s
"""
Regex-based security signals and risk_hints from source text.
"""

import re
from typing import Dict, List, Set

# (pattern, signal category). Kept as raw strings for introspection; compiled
# once in _COMPILED for the per-line scan.
PATTERNS = [
    (r"\bsubprocess\.(run|Popen|call)\b", "command_execution"),
    (r"\bos\.system\s*\(", "command_execution"),
    (r"\bexec\s*\(", "command_execution"),
    (r"\beval\s*\(", "command_execution"),
    (r"\.execute\s*\(\s*[\"']", "sql_execution"),
    (r"\bcursor\.execute\s*\(", "sql_execution"),
    # Keywords are grouped so the alternation stays anchored to the text(
    # prefix; a bare `...SELECT|INSERT|...` would flag any line containing
    # INSERT/UPDATE/DELETE anywhere, including comments.
    (r"\btext\s*\(\s*[\"'].*\b(?:SELECT|INSERT|UPDATE|DELETE)\b", "sql_execution"),
    (r"\bopen\s*\(", "file_access"),
    (r"\bPath\s*\([^)]*\)\.(read|write)", "file_access"),
    (r"\binput\s*\(", "user_input_source"),
    (r"\brequest\.(args|form|json|get)\b", "user_input_source"),
    (r"\bargv\b|\bsys\.argv\b", "user_input_source"),
    (r"\bhashlib\.|bcrypt\.|crypto\.|jwt\.|openssl\b", "crypto_usage"),
    (r"\bprint\s*\(", "debug_output"),
    (r"\bconsole\.log\s*\(", "debug_output"),
    (r"\bexcept\s+Exception\b|\bexcept\s*:", "broad_except"),
    (r"(?i)(password|secret|api_key|apikey|token)\s*=\s*['\"][^'\"]{8,}", "possible_hardcoded_secret"),
    (r"(?i)\b(auth|login|session|oauth|bearer)\b", "auth_keyword"),
]

# Compiled once at import; the scan below runs every pattern on every line.
_COMPILED = [(re.compile(p), cat) for p, cat in PATTERNS]

# Raw pattern category -> output dict key.
_CATEGORY_MAP = {
    "command_execution": "command_execution",
    "sql_execution": "sql_execution",
    "file_access": "file_access",
    "user_input_source": "user_input_sources",
    "crypto_usage": "crypto_usage",
    "debug_output": "debug_output",
    "broad_except": "error_handling",
    "possible_hardcoded_secret": "possible_hardcoded_secrets",
    "auth_keyword": "auth_related_keywords",
}

# Whole-content heuristic for database access (not tied to a single line).
_DB_HEURISTIC = re.compile(r"\bengine\.connect\b|\bconnection\b|\bdatabase\b", re.I)


def extract_signals(content: str) -> Dict[str, List[Dict]]:
    """Scan source text line by line for security-relevant signals.

    Returns a dict whose keys are the categories initialised below; each value
    is a list of {"line", "match", "category"} entries where "line" is 1-based
    and "match" is the stripped line capped at 200 chars. The "imports" and
    "function_calls" keys are reserved and are not populated by the current
    patterns. The database_access heuristic appends a single line-0 entry when
    engine.connect / connection / database appears anywhere in the content.
    """
    by_category: Dict[str, List[Dict]] = {
        "imports": [],
        "function_calls": [],
        "user_input_sources": [],
        "database_access": [],
        "sql_execution": [],
        "command_execution": [],
        "file_access": [],
        "crypto_usage": [],
        "debug_output": [],
        "error_handling": [],
        "possible_hardcoded_secrets": [],
        "auth_related_keywords": [],
    }
    for lineno, line in enumerate(content.splitlines(), start=1):
        for regex, cat in _COMPILED:
            if regex.search(line):
                key = _CATEGORY_MAP.get(cat, cat)
                # setdefault keeps us safe if a new pattern category is added
                # without a matching pre-initialised key.
                by_category.setdefault(key, []).append(
                    {"line": lineno, "match": line.strip()[:200], "category": cat})
    if _DB_HEURISTIC.search(content):
        by_category["database_access"].append(
            {"line": 0, "match": "heuristic", "category": "database_access"})
    return by_category


def risk_hints_from_signals(signals: Dict[str, List]) -> List[str]:
    """Collapse per-line signals into a sorted, deduplicated list of coarse hints."""
    hints: Set[str] = set()
    if signals.get("sql_execution"):
        hints.add("sql_execution_present")
    if signals.get("command_execution"):
        hints.add("command_execution_present")
    if signals.get("possible_hardcoded_secrets"):
        hints.add("possible_hardcoded_secret")
    if signals.get("user_input_sources"):
        hints.add("user_input_flow")
    if signals.get("file_access"):
        hints.add("file_io")
    # extract_signals stores broad-except hits under "error_handling"; the
    # "broad_except" key is also accepted for callers passing raw category names.
    if signals.get("broad_except") or signals.get("error_handling"):
        hints.add("error_handling_review")
    if signals.get("auth_related_keywords"):
        hints.add("auth_surface")
    return sorted(hints)
#!/usr/bin/env python3
"""
Run preprocessing and write JSON.

By default writes the LLM-friendly payload only (compact, no chunk content).
Use --full for complete preprocess output (scanners / cache).

Usage:
    python3 run_preprocess_test.py                 # -> ai_payload.json (LLM)
    python3 run_preprocess_test.py -o out.json     # LLM payload to out.json
    python3 run_preprocess_test.py --full          # full preprocess JSON
    python3 run_preprocess_test.py --full -o pre.json
    python3 run_preprocess_test.py other.py -o x.json
"""

import argparse
import json
import sys
from pathlib import Path

# Make the repo root importable even when the script is run from elsewhere.
ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(ROOT))

from preprocess.pipeline import run_file  # noqa: E402
from preprocess.normalized_findings import run_file_ai_payload  # noqa: E402

DEFAULT_INPUT = ROOT / "testquery.py"
DEFAULT_OUTPUT_LLM = "ai_payload.json"
DEFAULT_OUTPUT_FULL = "preprocess_output.json"


def main():
    """Parse CLI args, preprocess the input file, and write the JSON output.

    Exits with status 1 (message on stderr) when the input is not a file.
    """
    parser = argparse.ArgumentParser(
        description="Preprocess a file -> JSON (default: LLM payload only)",
    )
    parser.add_argument(
        "input",
        nargs="?",
        default=str(DEFAULT_INPUT),
        help="File path to preprocess (default: testquery.py)",
    )
    parser.add_argument(
        "-o", "--output",
        default=None,
        help="Output JSON path (default: ai_payload.json or preprocess_output.json with --full)",
    )
    parser.add_argument(
        "--full",
        action="store_true",
        help="Write full preprocess JSON (includes chunk content; for scanners, not for LLM)",
    )
    args = parser.parse_args()

    input_path = Path(args.input)
    if not input_path.is_file():
        print(f"Not a file: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Pick the generator and its default output name in one place.
    if args.full:
        data = run_file(input_path)
        out = args.output or DEFAULT_OUTPUT_FULL
    else:
        data = run_file_ai_payload(input_path)
        out = args.output or DEFAULT_OUTPUT_LLM

    out_path = Path(out)
    out_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Wrote {out_path}")
    print(f" project_id: {data.get('project_id', '')[:16]}...")

    if args.full:
        # Plain string literal: no placeholders, so no f-prefix (lint F541).
        print(" mode: full preprocess")
        print(f" files: {len(data['files'])}, chunks: {len(data['chunks'])}, skipped: {len(data['files_skipped'])}")
    else:
        n = len(data.get("normalized_findings") or [])
        print(f" mode: LLM payload ({n} findings, no chunk content)")


if __name__ == "__main__":
    main()