From e6d49b13a3908a03a0da08eee535cfe841f03658 Mon Sep 17 00:00:00 2001
From: Dev-Jeff28 <71960243+Dev-Jeff28@users.noreply.github.com>
Date: Sat, 25 Oct 2025 09:22:51 +0530
Subject: [PATCH 1/5] Create .gitkeep

---
 Data_Science/streamlit_ml_app/.gitkeep | 1 +
 1 file changed, 1 insertion(+)
 create mode 100644 Data_Science/streamlit_ml_app/.gitkeep

diff --git a/Data_Science/streamlit_ml_app/.gitkeep b/Data_Science/streamlit_ml_app/.gitkeep
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/Data_Science/streamlit_ml_app/.gitkeep
@@ -0,0 +1 @@
+

From 1bb6c4978c682b3d5b695c081487a7bf0c7621b8 Mon Sep 17 00:00:00 2001
From: Dev-Jeff28 <71960243+Dev-Jeff28@users.noreply.github.com>
Date: Sat, 25 Oct 2025 09:23:39 +0530
Subject: [PATCH 2/5] Add files via upload

---
 Data_Science/streamlit_ml_app/Readme.md |  26 ++
 .../streamlit_ml_app/requirements.txt   |   7 +
 .../streamlit_ml_app/streamlit_app.py   | 140 ++++++++++++++++++
 3 files changed, 173 insertions(+)
 create mode 100644 Data_Science/streamlit_ml_app/Readme.md
 create mode 100644 Data_Science/streamlit_ml_app/requirements.txt
 create mode 100644 Data_Science/streamlit_ml_app/streamlit_app.py

diff --git a/Data_Science/streamlit_ml_app/Readme.md b/Data_Science/streamlit_ml_app/Readme.md
new file mode 100644
index 0000000..0194ffe
--- /dev/null
+++ b/Data_Science/streamlit_ml_app/Readme.md
@@ -0,0 +1,26 @@
+R# 🧠 Interactive Streamlit ML App
+
+A small Streamlit web application that allows users to **upload a CSV file**, **choose a machine learning model**, and **view model performance metrics interactively**.
+
+This fulfills the GitHub issue:
+
+> **"Build a small Streamlit app in `/Data_Science/` that allows uploading a CSV, choosing a model, and viewing metrics interactively."**
+
+---
+
+## 🚀 Features
+
+- 📂 Upload any CSV dataset
+- 🎯 Select your **target (label) column**
+- ⚙️ Choose from built-in ML models (Logistic Regression, Random Forest, or a baseline Dummy classifier)
+- 📊 View metrics such as Accuracy, Precision, Recall, F1-Score, and Confusion Matrix
+- 🧩 Adjustable train-test split and random seed
+- 🧰 Supports **classification** targets (regression is not yet implemented)
+
+---
+
+## 🧰 Installation & Requirements
+pip install -r requirements.txt
+
+▶️ Running the App
+streamlit run streamlit_app.py
diff --git a/Data_Science/streamlit_ml_app/requirements.txt b/Data_Science/streamlit_ml_app/requirements.txt
new file mode 100644
index 0000000..f12f2d8
--- /dev/null
+++ b/Data_Science/streamlit_ml_app/requirements.txt
@@ -0,0 +1,7 @@
+streamlit>=1.20
+pandas
+scikit-learn
+matplotlib
+seaborn
+numpy
+plotly
diff --git a/Data_Science/streamlit_ml_app/streamlit_app.py b/Data_Science/streamlit_ml_app/streamlit_app.py
new file mode 100644
index 0000000..607d221
--- /dev/null
+++ b/Data_Science/streamlit_ml_app/streamlit_app.py
@@ -0,0 +1,140 @@
+# data_science/streamlit_app.py
+import streamlit as st
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.impute import SimpleImputer
+from sklearn.pipeline import Pipeline
+from sklearn.linear_model import LogisticRegression
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.dummy import DummyClassifier
+from sklearn.metrics import (
+    accuracy_score, precision_score, recall_score, f1_score,
+    confusion_matrix, classification_report, roc_auc_score, roc_curve
+)
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+st.set_page_config(page_title="Data Science Demo", layout="wide")
+
+st.title("Small Streamlit Data Science App")
+st.markdown("Upload a CSV, pick the target column, choose a model, and view metrics.")
+
+uploaded = st.file_uploader("Upload a CSV file", type=["csv"])
+if uploaded is None:
+    st.info("Upload a CSV to get started. Example: a classification dataset with a target column.")
+    st.stop()
+
+# read csv
+df = pd.read_csv(uploaded)
+st.write("### Preview of uploaded data", df.head())
+
+# choose target
+all_columns = df.columns.tolist()
+target = st.selectbox("Select target column (label)", options=all_columns)
+
+# simple features selection: drop non-numeric by default but allow user to choose
+st.write("Select feature columns (default: numeric columns excluding target)")
+numeric = df.select_dtypes(include=[np.number]).columns.tolist()
+default_features = [c for c in numeric if c != target]
+features = st.multiselect("Features", options=all_columns, default=default_features)
+
+if len(features) == 0:
+    st.error("Please select at least one feature column.")
+    st.stop()
+
+# task type detection (very naive)
+unique_vals = df[target].nunique()
+task_type = "classification" if unique_vals <= 20 else "regression (not implemented)"
+st.write(f"Detected: **{task_type}** (unique labels: {unique_vals})")
+
+if task_type != "classification":
+    st.warning("This demo only supports classification. Choose a categorical/binary target.")
+    st.stop()
+
+# train/test split params
+test_size = st.sidebar.slider("Test size (%)", min_value=10, max_value=50, value=25) / 100.0
+random_state = st.sidebar.number_input("Random state", min_value=0, max_value=9999, value=42)
+
+# model selection
+model_name = st.selectbox("Choose model", ["Logistic Regression", "Random Forest", "Baseline Dummy"])
+if model_name == "Logistic Regression":
+    model = LogisticRegression(max_iter=1000)
+elif model_name == "Random Forest":
+    model = RandomForestClassifier(n_estimators=100, random_state=random_state)
+else:
+    model = DummyClassifier(strategy="most_frequent")
+
+# prepare data
+X = df[features].copy()
+y = df[target].copy()
+
+# basic imputing and scaling pipeline
+pipeline = Pipeline([
+    ("imputer", SimpleImputer(strategy="mean")),
+    ("scaler", StandardScaler()),
+    ("clf", model)
+])
+
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=y)
+
+with st.spinner("Training model..."):
+    pipeline.fit(X_train, y_train)
+
+y_pred = pipeline.predict(X_test)
+metrics = {
+    "accuracy": accuracy_score(y_test, y_pred),
+    "precision_macro": precision_score(y_test, y_pred, average="macro", zero_division=0),
+    "recall_macro": recall_score(y_test, y_pred, average="macro", zero_division=0),
+    "f1_macro": f1_score(y_test, y_pred, average="macro", zero_division=0)
+}
+
+st.subheader("Metrics")
+col1, col2, col3, col4 = st.columns(4)
+col1.metric("Accuracy", f"{metrics['accuracy']:.4f}")
+col2.metric("Precision (macro)", f"{metrics['precision_macro']:.4f}")
+col3.metric("Recall (macro)", f"{metrics['recall_macro']:.4f}")
+col4.metric("F1 (macro)", f"{metrics['f1_macro']:.4f}")
+
+st.subheader("Classification report")
+st.text(classification_report(y_test, y_pred, zero_division=0))
+
+st.subheader("Confusion matrix")
+cm = confusion_matrix(y_test, y_pred)
+fig, ax = plt.subplots()
+sns.heatmap(cm, annot=True, fmt="d", ax=ax)
+ax.set_xlabel("Predicted")
+ax.set_ylabel("Actual")
+st.pyplot(fig)
+
+# ROC AUC for binary problems
+if len(np.unique(y_test)) == 2:
+    try:
+        y_score = pipeline.predict_proba(X_test)[:, 1]
+        auc = roc_auc_score(y_test, y_score)
+        st.write(f"ROC AUC: **{auc:.4f}**")
+        fpr, tpr, _ = roc_curve(y_test, y_score)
+        fig2, ax2 = plt.subplots()
+        ax2.plot(fpr, tpr)
+        ax2.plot([0,1],[0,1],"--")
+        ax2.set_xlabel("FPR")
+        ax2.set_ylabel("TPR")
+        ax2.set_title("ROC curve")
+        st.pyplot(fig2)
+    except Exception as e:
+        st.info("Model does not provide probability predictions to compute ROC AUC.")
+
+# feature importance (if model supports it)
+st.subheader("Feature importances (if available)")
+base_model = pipeline.named_steps["clf"]
+if hasattr(base_model, "feature_importances_"):
+    importances = base_model.feature_importances_
+    fi = pd.Series(importances, index=features).sort_values(ascending=False)
+    st.bar_chart(fi)
+elif hasattr(base_model, "coef_"):
+    coefs = np.abs(base_model.coef_).mean(axis=0)  # mean |coefficient| per feature across classes
+    fi = pd.Series(coefs, index=features).sort_values(ascending=False)
+    st.bar_chart(fi)
+else:
+    st.info("Selected model has no feature_importances_ or coef_.")

From 51fb603f1d6b68ea0752057c9141cc08e59ebaf9 Mon Sep 17 00:00:00 2001
From: Dev-Jeff28 <71960243+Dev-Jeff28@users.noreply.github.com>
Date: Sat, 25 Oct 2025 09:24:24 +0530
Subject: [PATCH 3/5] Update Readme.md

---
 Data_Science/streamlit_ml_app/Readme.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Data_Science/streamlit_ml_app/Readme.md b/Data_Science/streamlit_ml_app/Readme.md
index 0194ffe..ff7b146 100644
--- a/Data_Science/streamlit_ml_app/Readme.md
+++ b/Data_Science/streamlit_ml_app/Readme.md
@@ -1,4 +1,4 @@
-R# 🧠 Interactive Streamlit ML App
+## 🧠 Interactive Streamlit ML App
 
 A small Streamlit web application that allows users to **upload a CSV file**, **choose a machine learning model**, and **view model performance metrics interactively**.
 
@@ -24,3 +24,4 @@ pip install -r requirements.txt
 
 ▶️ Running the App
 streamlit run streamlit_app.py
+

From 14a9a39468302a82a97e20a23180ab39e78b5186 Mon Sep 17 00:00:00 2001
From: Dev-Jeff28 <71960243+Dev-Jeff28@users.noreply.github.com>
Date: Sat, 25 Oct 2025 09:25:06 +0530
Subject: [PATCH 4/5] Update Readme.md

---
 Data_Science/streamlit_ml_app/Readme.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/Data_Science/streamlit_ml_app/Readme.md b/Data_Science/streamlit_ml_app/Readme.md
index ff7b146..8326d56 100644
--- a/Data_Science/streamlit_ml_app/Readme.md
+++ b/Data_Science/streamlit_ml_app/Readme.md
@@ -2,11 +2,6 @@
 
 A small Streamlit web application that allows users to **upload a CSV file**, **choose a machine learning model**, and **view model performance metrics interactively**.
 
-This fulfills the GitHub issue:
-
-> **"Build a small Streamlit app in `/Data_Science/` that allows uploading a CSV, choosing a model, and viewing metrics interactively."**
-
----
 
 ## 🚀 Features
 
@@ -25,3 +20,4 @@ pip install -r requirements.txt
 ▶️ Running the App
 streamlit run streamlit_app.py
 
+

From 56e3697200bdc6f055edee1a11131b144178f013 Mon Sep 17 00:00:00 2001
From: Dev-Jeff28 <71960243+Dev-Jeff28@users.noreply.github.com>
Date: Sat, 25 Oct 2025 09:31:42 +0530
Subject: [PATCH 5/5] added a test.csv file for testing

---
 Data_Science/streamlit_ml_app/test.csv | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 Data_Science/streamlit_ml_app/test.csv

diff --git a/Data_Science/streamlit_ml_app/test.csv b/Data_Science/streamlit_ml_app/test.csv
new file mode 100644
index 0000000..a0da63d
--- /dev/null
+++ b/Data_Science/streamlit_ml_app/test.csv
@@ -0,0 +1,21 @@
+age,income,loan_approved
+25,40000,0
+35,60000,1
+45,80000,1
+30,50000,0
+50,90000,1
+28,42000,0
+42,75000,1
+39,65000,1
+33,48000,0
+55,100000,1
+26,41000,0
+31,53000,0
+48,85000,1
+29,46000,0
+41,70000,1
+38,62000,1
+52,95000,1
+34,56000,0
+47,88000,1
+40,72000,1
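
For reviewers who want to sanity-check the series without launching the UI, a minimal sketch of the same impute -> scale -> fit -> metrics flow the app implements, run non-interactively against the bundled test.csv. This is a hypothetical helper (smoke_test.py), not part of the patches: it assumes it is executed from Data_Science/streamlit_ml_app/ with pandas and scikit-learn installed, and the Random Forest choice and the 42 seed are illustrative defaults mirroring the app.

# smoke_test.py -- hypothetical helper, not included in the patch series.
# Reproduces the app's core pipeline on test.csv and prints a few metrics.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

df = pd.read_csv("test.csv")
X = df[["age", "income"]]   # feature columns from the sample dataset
y = df["loan_approved"]     # target column from the sample dataset

# Same preprocessing + model structure as streamlit_app.py
pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=42)),
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("accuracy:", accuracy_score(y_test, y_pred))
print("f1 (macro):", f1_score(y_test, y_pred, average="macro"))
print("confusion matrix:\n", confusion_matrix(y_test, y_pred))

With only 20 rows in test.csv this is a smoke test rather than an evaluation, so expect noisy metrics from the 5-row test split.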