diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1c16435 --- /dev/null +++ b/.gitignore @@ -0,0 +1,37 @@ +# Visual Studio cache en gebruikersinstellingen +.vs/ +*.suo +*.user +*.userosscache +*.sln.docstates +*.VC.db + +# Visual Studio .db & .wal bestanden +*.db +*.db-shm +*.db-wal +*.vsidx + +# Python __pycache__ directory en .pyc bestanden +__pycache__/ +*.py[cod] + +# SQLite databasebestanden (optioneel — als die lokaal zijn) +*.sqlite + +# JetBrains IDE's (optioneel) +.idea/ + +# VS Code settings (optioneel) +.vscode/ + +# Systeembestanden +.DS_Store +Thumbs.db + +# Logs en tijdelijke bestanden +*.log +*.tmp + +# JSON uitsluiten (alleen als je dit echt wilt) +*.json diff --git a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db new file mode 100644 index 0000000..a88c310 Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db differ diff --git a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-shm b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-shm new file mode 100644 index 0000000..a2b5300 Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-shm differ diff --git a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-wal b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-wal new file mode 100644 index 0000000..271bd0e Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/CodeChunks.db-wal differ diff --git a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db new file mode 100644 index 0000000..0a973d7 Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db differ diff --git 
a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-shm b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-shm new file mode 100644 index 0000000..3792802 Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-shm differ diff --git a/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-wal b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-wal new file mode 100644 index 0000000..6cbf3ad Binary files /dev/null and b/.vs/project-2-eda-sql/CopilotIndices/17.14.670.39694/SemanticSymbols.db-wal differ diff --git a/.vs/project-2-eda-sql/v17/.wsuo b/.vs/project-2-eda-sql/v17/.wsuo new file mode 100644 index 0000000..43950a2 Binary files /dev/null and b/.vs/project-2-eda-sql/v17/.wsuo differ diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite new file mode 100644 index 0000000..f0e52b7 Binary files /dev/null and b/.vs/slnx.sqlite differ diff --git a/Naamloze presentatie.pptx b/Naamloze presentatie.pptx new file mode 100644 index 0000000..3ba46ea Binary files /dev/null and b/Naamloze presentatie.pptx differ diff --git a/README.md b/README.md index 740c67f..5f9e3ea 100644 --- a/README.md +++ b/README.md @@ -1,45 +1,59 @@ -![logo_ironhack_blue 7](https://user-images.githubusercontent.com/23629340/40541063-a07a0a8a-601a-11e8-91b5-2f13e4e6b441.png) +# Delivery Insights Dashboard -# Business Challenge: EDA and SQL +## Overview -## Introduction +**Delivery Insights Dashboard** is a modular data analytics application focused on the procurement process and supplier performance. It transforms raw JSON datasets from internal APIs into insightful, interactive visualizations. The tool is intended to help supply chain analysts, procurement teams, and operational managers understand where deliveries deviate from planning and identify improvement opportunities. 
-A data project lifecycle has many phases, rather than being just an isolated analysis in a single tool. -In this project you will experience doing an analysis using both Python and SQL to obtain the final result, by exploring each tool's behavior. +The application is built using: -## Project Overview +- **Python** for backend data transformation and analytics +- **Pandas & NumPy** for data manipulation +- **Streamlit** for the interactive frontend UI +- **Plotly** for visually rich, customizable charts +- **SciPy** for statistical testing and significance analysis -Pick up a dataset in our common datasets repos and break your work into big steps: - 1. Pick a topic and choose a dataset on that topic. Build around 10 Business questions to answer about this topic. - - Try to build the questions before knowing everything about the data - - If not possible, do step 2. first - 2. Data Analysis: Understand your dataset and create a report (word document) about it - 3. Data Exploration and Business Understanding: - - Import your dataset into SQL - - Answer your Business questions with SQL Queries +All data transformations and visual outputs are dynamically generated based on user input, allowing deep-dive exploration without needing to code. 
+--- -## Dataset repos +## Features - - [Kaggle](https://www.kaggle.com/) - - [Machine Learning Repository](https://archive.ics.uci.edu/) - - [PorData](https://www.pordata.pt/) - - [And many more](https://medium.com/@LearnPythonProgramming/best-data-sources-for-datasets-beyond-kaggle-98aac51e971e) +### ✅ Automated Data Ingestion +- Downloads and caches procurement and delivery datasets from local JSON endpoints +- Ensures repeatable and fail-safe fetching using fallback and logging logic +### 🧼 Robust Data Cleaning +- Utilizes a reusable `DataFrameCleaner` utility to standardize column types and formats +- Handles datetime conversion, missing values, string normalization, and invalid data filtering -## Bonus +### 📦 Delivery Performance Tracking +- Calculates **expected vs. actual delivery dates** per order line +- Derives key indicators such as: + - Whether a line was **fully delivered** + - Number of deliveries per order line + - Delay in days (positive or negative) relative to expected delivery - - Bonus points if you augment your data with data your obtain through WebScrapping - - Bonus points if you include visualizations from Python and/or Tableau in the final presentation +### 📊 Advanced Visualizations +- Uses **Plotly** for bar, line, and stacked visualizations +- Includes supplier filtering, top-X percent segmentation, and missing value detection +- Shows both **order-level** and **order-line-level** analyses -## Deliverables +### 📈 Timeliness & Trends +- Tracks monthly delivery frequency per supplier +- Visualizes how suppliers perform over time +- Automatically highlights most active suppliers -1. **Python Code:** Provide well-documented Python code that conducts the analysis and SQL upload. -2. **SQL text file (.sql)** well commented document with all the queries answering the Business questions -3. 
**Short Presentation:** Structure the presentation in the following way: - - Intro Slides: introduce the problem and the datasets - - Data cleaning and assumptions - - Business questions and SQL query (1 slide per question with a print screen of the query and the answer is enough) -4. **PDF Document** with notes you might want to share +### 📉 Statistical Insights +- Performs chi-squared tests for independence between delivery categories and responsible staff +- Calculates **Cramér’s V** to evaluate effect strength +- Flags statistically significant results and displays contingency tables interactively +### 🧭 Interactive Filtering +- Year selector: isolate one or multiple years of delivery data +- Supplier selector: choose specific suppliers or rely on automatic relevance filtering (top %) +- Modular layout in Streamlit tabs for clarity and drilldown + +--- + +## Project Structure diff --git a/__pycache__/cleanup.cpython-312.pyc b/__pycache__/cleanup.cpython-312.pyc new file mode 100644 index 0000000..639986f Binary files /dev/null and b/__pycache__/cleanup.cpython-312.pyc differ diff --git a/__pycache__/cleanup.cpython-313.pyc b/__pycache__/cleanup.cpython-313.pyc new file mode 100644 index 0000000..e95074c Binary files /dev/null and b/__pycache__/cleanup.cpython-313.pyc differ diff --git a/__pycache__/eda_service.cpython-312.pyc b/__pycache__/eda_service.cpython-312.pyc new file mode 100644 index 0000000..3d20cd9 Binary files /dev/null and b/__pycache__/eda_service.cpython-312.pyc differ diff --git a/__pycache__/eda_service.cpython-313.pyc b/__pycache__/eda_service.cpython-313.pyc new file mode 100644 index 0000000..da48ce7 Binary files /dev/null and b/__pycache__/eda_service.cpython-313.pyc differ diff --git a/__pycache__/loader.cpython-312.pyc b/__pycache__/loader.cpython-312.pyc new file mode 100644 index 0000000..50fa4b2 Binary files /dev/null and b/__pycache__/loader.cpython-312.pyc differ diff --git a/__pycache__/loader.cpython-313.pyc 
class DataFrameCleaner:
    """
    Apply common clean-up steps (drop, retype, rename, normalise) to one DataFrame.

    The cleaner works on the DataFrame it is given rather than on a copy, so
    every step is visible both through the caller's own reference and through
    `get_cleaned_df()`.
    """

    def __init__(self, df: pd.DataFrame, name: str = "DataFrame", log_enabled: bool = False):
        """
        Initialize the cleaner using the input DataFrame directly (no copy).

        Parameters:
        - df: The input pandas DataFrame to clean (modified in-place).
        - name: Optional name used in logs to identify this cleaner instance.
        - log_enabled: If True, log messages will be printed to stdout.
        """
        self.df = df  # Direct use, no .copy()
        self.name = name
        self.log_enabled = log_enabled

    def _log(self, message: str):
        """
        Internal helper to print log messages only when logging is enabled.
        """
        if self.log_enabled:
            print(message)

    def drop_columns(self, columns: list):
        """
        Drop specified columns from the DataFrame if they exist.

        Parameters:
        - columns: List of column names to drop. Names that are not present
          are skipped (and logged) instead of raising.
        """
        self._log(f"\n=== {self.name} — Dropping Specified Columns ===")
        existing_cols = [col for col in columns if col in self.df.columns]
        missing_cols = [col for col in columns if col not in self.df.columns]

        if existing_cols:
            self.df.drop(columns=existing_cols, inplace=True)
            self._log(f"Dropped columns: {', '.join(existing_cols)}")
        else:
            self._log("No columns to drop.")

        if missing_cols:
            self._log(f"Skipped (not found): {', '.join(missing_cols)}")

    def _convert_column(self, col, typ):
        """
        Convert a single column in place; raises whatever pandas raises on failure.
        """
        if typ == 'datetime':
            # Unparseable values become NaT; the result is timezone-aware (UTC).
            self.df[col] = pd.to_datetime(self.df[col], errors='coerce', utc=True)
        elif typ == 'numeric':
            # Unparseable values become NaN.
            self.df[col] = pd.to_numeric(self.df[col], errors='coerce')
        else:
            self.df[col] = self.df[col].astype(typ)

    def apply_dtype_mapping(self, mapping: dict = None):
        """
        Apply data type conversions to columns as specified in the mapping.

        Each column is converted on its own and assigned back into the same
        DataFrame object, so (a) the in-place contract of this class holds for
        every target type, and (b) one column that cannot be converted does
        not cause unrelated columns to be reported as failed.

        Parameters:
        - mapping: Dictionary where keys are column names and values are target data types.
          Supported types: 'datetime', 'numeric', 'str', 'bool', or any valid numpy/pandas dtype.
          Columns that are not in the DataFrame are skipped. None means "nothing to do".
        """
        self._log(f"\n=== {self.name} — Applying Type Mappings ===")
        if mapping is None:
            self._log("No mapping provided.")
            return

        # Only keep columns that exist in the DataFrame
        valid_mapping = {col: typ for col, typ in mapping.items() if col in self.df.columns}
        skipped = [col for col in mapping if col not in self.df.columns]

        def phase(item):
            # Keep the historical processing/log order:
            # datetimes first, then 'str'/'bool', then everything else.
            typ = item[1]
            if typ == 'datetime':
                return 0
            if typ in ['str', 'bool']:
                return 1
            return 2

        converted = []
        failed = []
        for col, typ in sorted(valid_mapping.items(), key=phase):
            try:
                self._convert_column(col, typ)
            except Exception:
                # Best-effort by design: a bad column is reported, not fatal.
                failed.append(col)
            else:
                converted.append((col, typ))

        # Logging results
        if converted:
            self._log("Converted columns: " + ", ".join(f"{col}: {typ}" for col, typ in converted))
        if skipped:
            self._log(f"Skipped (not in DataFrame): {', '.join(map(str, skipped))}")
        if failed:
            self._log(f"Failed to convert: {', '.join(map(str, failed))}")

    def rename_columns(self, rename_map: dict):
        """
        Rename columns in the DataFrame using a provided mapping.

        Parameters:
        - rename_map: Dictionary mapping old column names to new names.
          Old names that are not present are skipped (and logged).
        """
        self._log(f"\n=== {self.name} — Renaming Columns ===")
        existing = {k: v for k, v in rename_map.items() if k in self.df.columns}
        missing = [k for k in rename_map if k not in self.df.columns]

        if existing:
            self.df.rename(columns=existing, inplace=True)
            self._log("Renamed columns: " + ", ".join(f"{k} -> {v}" for k, v in existing.items()))
        else:
            self._log("No columns were renamed.")

        if missing:
            self._log(f"Skipped (not found): {', '.join(missing)}")

    def normalize_nones(self):
        """
        Replace string values 'None' and 'null' (as text) with pandas NA (missing values).
        """
        self._log(f"\n=== {self.name} — Replacing 'None'/'null' strings with NaN ===")
        self.df.replace(["None", "null"], pd.NA, inplace=True)

    def get_cleaned_df(self):
        """
        Return the cleaned DataFrame.

        Returns:
        - pandas DataFrame after all applied transformations (the same object
          that was passed to the constructor).
        """
        return self.df
class EDAService:
    """
    Console-based exploratory data analysis helpers for a single DataFrame.

    Every method prints a titled report section to stdout. Only
    `structure_overview` returns a value.
    """

    def __init__(self, df: pd.DataFrame, name: str = "DataFrame", preview_rows: int = 5):
        """
        Parameters:
        - df: DataFrame to inspect (used as-is, not copied).
        - name: Label printed in every section header.
        - preview_rows: Number of rows shown by `sample_preview`.
        """
        self.df = df
        self.name = name
        self.preview_rows = preview_rows

    def run_step(self, step: int):
        """
        Run one numbered EDA step (1-9); any other value prints an error line.
        """
        # Logical order of steps for effective EDA
        if step == 1:
            self.structure_overview()  # Shape and types
        elif step == 2:
            self.sample_preview()  # First few rows
        elif step == 3:
            self.missing_values()  # Absolute missing counts
        elif step == 4:
            self.null_percentage()  # Percentage missing
        elif step == 5:
            self.duplicate_rows(show_samples=False)
        elif step == 6:
            self.numeric_summary()
        elif step == 7:
            self.value_ranges()
        elif step == 8:
            self.categorical_summary(top_n=5)
        elif step == 9:
            self.correlation_matrix()
        else:
            print(f"Invalid step: {step}")

    def structure_overview(self, max_cols: int = 50):
        """
        Print shape, dtype distribution, example column names per dtype, one
        representative (first non-null) value per column and the columns that
        hold a single distinct value.

        Parameters:
        - max_cols: Maximum number of column names listed per dtype.

        Returns:
        - dict mapping each column name to its first non-null value
          (None when the column is entirely null).
        """
        print(f"\n=== {self.name} — Structure Overview ===")
        row_count, col_count = self.df.shape
        print(f"Rows: {row_count}")
        print(f"Columns: {col_count}\n")

        print("Column counts by type:")
        type_counts = self.df.dtypes.value_counts()
        for dtype, count in type_counts.items():
            percentage = (count / col_count) * 100
            print(f"- {dtype}: {count} columns ({percentage:.1f}%)")

        print("\nExample column names by type:")
        dtypes_series = self.df.dtypes.astype(str)
        grouped = dtypes_series.groupby(dtypes_series)
        for dtype, cols in grouped.groups.items():
            cols_list = list(cols)
            shown_cols = cols_list[:max_cols]
            print(f"\n{dtype} ({len(cols_list)} columns):")
            print(", ".join(str(c) for c in shown_cols) + (f" ... (+{len(cols_list) - max_cols} more)" if len(cols_list) > max_cols else ""))

        print("\nNote:")
        print("- object = likely strings, mixed types, or nested structures (e.g., dict/list)")
        print("- int64 / float64 = numeric values")
        print("- bool = True/False data")
        print("- datetime64 = timestamps or date fields")

        print("\nRepresentative values per column:")
        representative_values = {}
        for col in self.df.columns:
            non_null_series = self.df[col].dropna()
            val = non_null_series.iloc[0] if not non_null_series.empty else None
            representative_values[col] = val
            print(f"- {col}: {repr(val)}")

        print("\nColumns with only one unique value:")
        for col in self.df.columns:
            column = self.df[col]
            try:
                if column.nunique(dropna=False) != 1:
                    continue
                uniques = column.dropna().unique()
            except TypeError:
                # Cells holding list/dict values are unhashable, so uniqueness
                # cannot be computed for this column; skip it rather than
                # aborting the whole overview.
                continue
            single_value = uniques[0] if len(uniques) > 0 else None
            print(f"- {col}: {repr(single_value)}")

        return representative_values

    def sample_preview(self, max_cols: int = 20, show_all_rows: bool = False):
        """
        Print the first `preview_rows` rows (or the whole frame).

        Parameters:
        - max_cols: Value used for pandas' `display.max_columns` while printing.
        - show_all_rows: If True, print the entire DataFrame.
        """
        print(f"\n=== {self.name} — Sample Preview ===")
        # option_context restores the caller's previous setting (also when
        # printing raises) instead of resetting it to the pandas default.
        with pd.option_context("display.max_columns", max_cols):
            print(self.df if show_all_rows else self.df.head(self.preview_rows))

    def missing_values(self):
        """
        Print the absolute number of missing values for columns that have any.
        """
        print(f"\n=== {self.name} — Missing Values (count) ===")
        missing = self.df.isnull().sum()
        missing = missing[missing > 0]
        if missing.empty:
            print("No missing values.")
        else:
            print(missing.sort_values(ascending=False))

    def null_percentage(self):
        """
        Print the percentage of missing values for columns that have any,
        highest first.
        """
        print(f"\n=== {self.name} — Missing Values (percentage) ===")
        total = len(self.df)
        nulls = (self.df.isnull().sum() / total * 100).sort_values(ascending=False)
        nulls = nulls[nulls > 0]
        if nulls.empty:
            print("No missing value percentages above zero.")
        else:
            print(nulls.round(2))

    def duplicate_rows(self, show_samples: bool = True):
        """
        Count duplicate rows, comparing only columns whose cells are all
        int/float/str/bool/None (other cell types are left out of the comparison).

        Parameters:
        - show_samples: If True, print the first duplicated rows.
        """
        print(f"\n=== {self.name} — Duplicate Rows ===")
        hashable_cols = [col for col in self.df.columns if self.df[col].map(type).isin([int, float, str, bool, type(None)]).all()]
        if not hashable_cols:
            print("No hashable columns to detect duplicates.")
            return

        try:
            dupe_mask = self.df[hashable_cols].duplicated()
            dupe_count = dupe_mask.sum()
            print(f"Duplicate count (based on {len(hashable_cols)} hashable columns): {dupe_count}")
            if dupe_count > 0 and show_samples:
                print(self.df[dupe_mask].head())
        except Exception as e:
            print(f"Error during duplicate detection: {e}")

    def numeric_summary(self):
        """
        Print `describe()` for the numeric columns.
        """
        print(f"\n=== {self.name} — Numeric Summary ===")
        numeric_df = self.df.select_dtypes(include=[np.number])
        if numeric_df.empty:
            print("No numeric columns available.")
        else:
            print(numeric_df.describe())

    def value_ranges(self):
        """
        Print the minimum and maximum of every numeric column.
        """
        print(f"\n=== {self.name} — Value Ranges (numeric) ===")
        num_df = self.df.select_dtypes(include=[np.number])
        for col in num_df.columns:
            print(f"{col}: min={num_df[col].min()}, max={num_df[col].max()}")

    def categorical_summary(self, top_n: int = 5):
        """
        Print the `top_n` most frequent values of every object/category column.

        Columns whose cells cannot be hashed (e.g. list/dict values) are
        reported as skipped instead of raising.
        """
        print(f"\n=== {self.name} — Categorical Summary ===")
        cat_cols = self.df.select_dtypes(include=["object", "category"]).columns
        if not len(cat_cols):
            print("No categorical columns found.")
            return
        for col in cat_cols:
            print(f"\nColumn: {col}")
            try:
                print(self.df[col].value_counts().head(top_n))
            except TypeError:
                print("Skipped: column contains unhashable values (e.g. list/dict).")

    def correlation_matrix(self):
        """
        Print the pairwise correlation matrix of the numeric columns.
        """
        print(f"\n=== {self.name} — Correlation Matrix ===")
        num_df = self.df.select_dtypes(include=[np.number])
        if num_df.empty:
            print("No numeric columns present.")
        else:
            print(num_df.corr(numeric_only=True))
# Local directory in which the downloaded JSON files are cached
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)

# Seconds to wait for the data server before giving up on a download
REQUEST_TIMEOUT_SECONDS = 30

# Dataset name -> (source URL, cache file name inside DATA_DIR)
datasets = {
    "Inkooporderregels": ("http://10.11.10.104:5100/F/Inkooporderregels_All.json", "Inkooporderregels_All.json"),
    "Ontvangstregels": ("http://10.11.10.104:5100/F/Ontvangstregels.json", "Ontvangstregels.json"),
    "Relaties": ("http://10.11.10.104:5100/F/Relaties.json", "Relaties.json"),
    "FeedbackLeveranciers": ("http://10.11.10.104:5100/F/FeedbackLeveranciers.json", "FeedbackLeveranciers.json"),
    "Leveranciers": ("http://10.11.10.104:5100/F/Leveranciers.json", "Leveranciers.json")  # Added Leveranciers dataset
}


def download_if_missing(url: str, filename: str, log: bool = False,
                        timeout: float = REQUEST_TIMEOUT_SECONDS):
    """
    Return the path of the cached copy of `filename`, downloading it first if needed.

    Parameters:
    - url: Address the JSON document is fetched from when no cached file exists.
    - filename: Name of the cache file inside DATA_DIR.
    - log: If True, progress messages are printed to stdout.
    - timeout: Seconds to wait for the server; without it a stalled server
      would block the call indefinitely.

    Returns:
    - Path of the cached file.

    Raises:
    - Whatever the download, JSON decoding or file write raises; the error is
      logged (when `log` is True) and re-raised.
    """
    filepath = os.path.join(DATA_DIR, filename)
    if os.path.exists(filepath):
        if log:
            print(f"Using cached file: {filepath}")
        return filepath

    if log:
        print(f"Downloading {filename} from {url} ...")
    partial_path = filepath + ".part"
    try:
        os.makedirs(DATA_DIR, exist_ok=True)
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        # Decode before touching the disk: an invalid body must not leave an
        # empty cache file behind that later runs would treat as valid.
        payload = response.json()
        with open(partial_path, "w", encoding="utf-8") as f:
            json.dump(payload, f, ensure_ascii=False)
        # Publish the file only once it is completely written.
        os.replace(partial_path, filepath)
        if log:
            print(f"Saved to {filepath}")
    except Exception as e:
        if os.path.exists(partial_path):
            os.remove(partial_path)
        if log:
            print(f"Failed to download {filename}: {e}")
        raise
    return filepath


def load_nested_json_file(filepath: str, log: bool = False):
    """
    Load a JSON file into a DataFrame.

    A top-level object is read as one row per value (its keys are dropped);
    a top-level array is read as one row per element.

    Raises:
    - ValueError: if the top-level JSON value is neither an object nor an array.
    """
    if log:
        print(f"Loading file: {filepath}")
    with open(filepath, encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        return pd.DataFrame(list(data.values()))
    if isinstance(data, list):
        return pd.DataFrame(data)
    raise ValueError("Unsupported JSON structure")


def load_all_datasets(log: bool = False):
    """
    Download (if not cached) and load every dataset.

    Returns:
    - Tuple of DataFrames in the order: Inkooporderregels, Ontvangstregels,
      Relaties, FeedbackLeveranciers, Leveranciers.

    Raises:
    - Any download/parse error, after logging it when `log` is True.
    """
    names = ("Inkooporderregels", "Ontvangstregels", "Relaties", "FeedbackLeveranciers", "Leveranciers")
    try:
        # Fetch everything first so a missing source fails before any parsing.
        paths = [download_if_missing(*datasets[name], log=log) for name in names]
        return tuple(load_nested_json_file(path, log=log) for path in paths)
    except Exception as e:
        if log:
            print(f"Error loading datasets: {e}")
        raise
'DatumToegezegd': 'datetime', + 'AfwijkendeAfleverdatum': 'datetime', + 'Vrijgegeven_op': 'datetime', + 'getDate': 'datetime', + 'Naam': 'str' +} +ontvangst_columns_to_convert = { + 'Datum': 'datetime' +} + +# ----------------------------- +# Cleaning +# ----------------------------- +cleaner_inkoop = DataFrameCleaner(df_inkooporderregels, name="df_inkooporderregels") +cleaner_inkoop.apply_dtype_mapping(inkoop_columns_to_convert) +df_inkooporderregels_clean = cleaner_inkoop.get_cleaned_df()[relevant_columns_inkoop].copy() + +cleaner_ontvangst = DataFrameCleaner(df_ontvangstregels, name="df_ontvangstregels") +cleaner_ontvangst.apply_dtype_mapping(ontvangst_columns_to_convert) +df_ontvangstregels_clean = cleaner_ontvangst.get_cleaned_df()[relevant_columns_ontvangst].copy() + +# ----------------------------- +# Filter: remove irrelevant rows +# ----------------------------- +# Hier verwijderen we de order regels waarvan standaard geen verzending wordt ingvuld of deze toch niet relevant is +df_inkooporderregels_clean = df_inkooporderregels_clean[df_inkooporderregels_clean['DsEx'] != 'KVERZEND'].copy() + +# ----------------------------- +# Determine Expected Delivery Date +# ----------------------------- +# Bepaal de verwachte leverdatum per regel: +# Gebruik 'AfwijkendeAfleverdatum' als primaire bron, en val terug op 'DatumToegezegd' indien nodig. 
+# Voor de duidelijkheid dit is de datum waarop een order geleverd zou moeten zijn +df_inkooporderregels_clean['ExpectedDeliveryDate'] = df_inkooporderregels_clean['AfwijkendeAfleverdatum'].combine_first( + df_inkooporderregels_clean['DatumToegezegd'] +) +# Verwijder de kollommen die we nu niet meer nodig hebben +df_inkooporderregels_clean.drop(columns=['AfwijkendeAfleverdatum', 'DatumToegezegd'], inplace=True) + +# Filter regels: +# - Alleen regels behouden waar zowel 'Datum' (orderdatum) als 'ExpectedDeliveryDate' gevuld is +# - Alleen regels behouden waar de verwachte leverdatum op of ná de orderdatum ligt +# (levering vóór bestelling is niet logisch, dus die regels worden verwijderd) +df_inkooporderregels_clean = df_inkooporderregels_clean[ + df_inkooporderregels_clean['ExpectedDeliveryDate'].notna() & # ExpectedDeliveryDate moet ingevuld zijn + df_inkooporderregels_clean['Datum'].notna() & # Orderdatum moet ingevuld zijn + (df_inkooporderregels_clean['ExpectedDeliveryDate'] >= df_inkooporderregels_clean['Datum']) # Geen leverdatum vóór orderdatum +].copy() + +# Determine max expected date per order +# Dit is de datum waarop de laatste order regel binnen zou moeten zijn en dus de uiteindelijke leverdatum +latest_expected_per_order = df_inkooporderregels_clean.groupby('OrNu')['ExpectedDeliveryDate'].max() + +# Add to dataframe +df_inkooporderregels_clean['OrderDeliveryDate'] = df_inkooporderregels_clean['OrNu'].map(latest_expected_per_order) +df_inkooporderregels_clean['OrderDeliveryDate'] = df_inkooporderregels_clean['OrderDeliveryDate'].dt.tz_localize(None) +df_inkooporderregels_clean['Datum'] = df_inkooporderregels_clean['Datum'].dt.tz_localize(None) + +# ----------------------------- +# Delivery Data Preparation +# ----------------------------- + +# Tel per regel-GUID hoe vaak er een levering op plaatsvond (meerdere leveringen mogelijk) +delivery_counts = df_ontvangstregels_clean['BronregelGuid'].value_counts() + +# Bepaal het totaal aantal ontvangen stuks 
def analyse_leveringen(df_subset, delivery_counts, total_received):
    """
    Add delivery figures to a copy of the purchase-order lines.

    Parameters:
    - df_subset: Order lines; must contain 'GuLiIOR' (line id) and 'QuUn' (ordered quantity).
    - delivery_counts: Mapping/Series from line id to number of receipts.
    - total_received: Mapping/Series from line id to total received quantity.

    Returns:
    - Copy of `df_subset` with 'DeliveryCount', 'TotalReceived' and
      'FullyDelivered' added and missing 'QuUn' values replaced by 0.0.
      The input frame itself is not modified.
    """
    result = df_subset.copy()
    line_ids = result['GuLiIOR']

    # Lines without any receipt are absent from the lookups -> count/quantity 0
    times_delivered = line_ids.map(delivery_counts).fillna(0)
    units_received = line_ids.map(total_received).fillna(0)
    # A missing ordered quantity is treated as 0
    units_ordered = result['QuUn'].fillna(0)

    result['DeliveryCount'] = times_delivered.astype(int)
    result['TotalReceived'] = units_received.astype(float)
    result['QuUn'] = units_ordered.astype(float)

    # A line counts as complete once at least the ordered quantity has arrived
    result['FullyDelivered'] = result['TotalReceived'] >= result['QuUn']
    return result
class UI:
    """Streamlit front end for analysing supplier delivery performance.

    The data frame handed to the constructor is read through these columns:
    ``Datum``, ``ExpectedDeliveryDate`` and ``DeliveryDate`` (datetime-like,
    accessed via ``.dt``), ``Naam`` (supplier), ``OrNu`` (order number,
    optional), ``FullyDelivered``, ``DeliveryCount`` and
    ``Verantwoordelijke`` (responsible person).

    Attributes:
        original_df: Private copy of the unfiltered input.
        selected_years: Years chosen in the year multiselect (empty = all).
        selected_suppliers: Suppliers chosen in the supplier multiselect.
        filtered_df: ``original_df`` narrowed by the current selections.
        top_percent: Percentage of top-ranked suppliers shown per chart, or
            ``None`` when suppliers were picked manually (no truncation).
    """

    # Fixed order of the timeliness categories. Charts always reference all
    # three, so every pivot is reindexed to exactly these columns.
    DELIVERY_CATEGORIES = ['Early', 'On Time', 'Late']
    DEFAULT_TOP_PERCENT = 10

    def __init__(self, df):
        """Store a private copy of *df* and reset every filter."""
        self.original_df = df.copy()
        self.selected_years = []
        self.selected_suppliers = []
        self.filtered_df = df.copy()
        self.top_percent = self.DEFAULT_TOP_PERCENT

    # ------------------------------------------------------------------
    # Pure helpers (no Streamlit / Plotly calls)
    # ------------------------------------------------------------------

    @staticmethod
    def _delay_category(delay_days):
        """Map a delay in days to 'Early' (<0), 'On Time' (0) or 'Late'.

        A NaN delay fails both comparisons and would end up as 'Late', so
        callers drop rows with missing dates before classifying.
        """
        if delay_days < 0:
            return 'Early'
        if delay_days == 0:
            return 'On Time'
        return 'Late'

    def _top_count(self, n_items):
        """Return how many of *n_items* fall inside the top-% (at least 1)."""
        return max(1, int(n_items * self.top_percent / 100))

    def _keep_top(self, ranked):
        """Truncate an already ranked frame/series to the top-% rows.

        Returned unchanged when ``top_percent`` is ``None`` (manual supplier
        selection deactivates the top-% filter).
        """
        if self.top_percent is None:
            return ranked
        return ranked.head(self._top_count(len(ranked)))

    def _timeliness_pivot(self, frame):
        """Count Early / On Time / Late rows per supplier.

        Args:
            frame: Data with ``Naam``, ``ExpectedDeliveryDate`` and
                ``DeliveryDate`` columns; one row per item to classify.

        Returns:
            DataFrame with columns ``Naam``, ``Early``, ``On Time``, ``Late``
            sorted by total count (descending) and limited to the top-%
            suppliers. Rows lacking either date are ignored. All three
            category columns are always present, filled with 0 if needed.
        """
        columns = ['Naam'] + self.DELIVERY_CATEGORIES
        dated = frame.dropna(subset=['ExpectedDeliveryDate', 'DeliveryDate'])
        if dated.empty:
            return pd.DataFrame(columns=columns)

        delay_days = (dated['DeliveryDate'] - dated['ExpectedDeliveryDate']).dt.days
        labelled = dated.assign(Category=delay_days.apply(self._delay_category))
        counts = (
            labelled.groupby(['Naam', 'Category'])
            .size()
            .unstack(fill_value=0)
            # A category that never occurs would otherwise be a missing
            # column, and px.bar raises on unknown column names.
            .reindex(columns=self.DELIVERY_CATEGORIES, fill_value=0)
        )
        ranking = counts.sum(axis=1).sort_values(ascending=False, kind='stable').index
        return self._keep_top(counts.loc[ranking]).reset_index()

    def _supplier_counts(self, frame):
        """Return rows-per-supplier as ``Supplier``/``Count``, top-% only.

        ``value_counts`` already sorts by descending count, so no extra sort
        is required before truncating.
        """
        counts = (
            frame['Naam']
            .value_counts()
            .rename_axis('Supplier')
            .reset_index(name='Count')
        )
        return self._keep_top(counts)

    @staticmethod
    def _association_stats(observed):
        """Run a chi-square test on a contingency table.

        Returns:
            ``(chi2, p_value, cramers_v)``. ``cramers_v`` is ``None`` when the
            table has fewer than two rows or two columns, because the
            statistic would require a division by zero.
        """
        chi2_stat, p_val, _dof, _expected = chi2_contingency(observed)
        n_rows, n_cols = observed.shape
        min_dim = min(n_rows - 1, n_cols - 1)
        if min_dim < 1:
            return float(chi2_stat), float(p_val), None
        total = observed.to_numpy().sum()
        cramers_v = (chi2_stat / total / min_dim) ** 0.5
        return float(chi2_stat), float(p_val), float(cramers_v)

    # ------------------------------------------------------------------
    # Filter widgets
    # ------------------------------------------------------------------

    def year_selection(self):
        """Render the year multiselect and rebuild ``filtered_df`` from it."""
        years = self.original_df['Datum'].dt.year
        # Missing dates yield NaN years; keep them out of the option list.
        all_years = sorted(int(year) for year in years.dropna().unique())
        self.selected_years = st.multiselect(
            'Select one or more years (leave empty to include all):',
            options=all_years,
            default=[]
        )

        if self.selected_years:
            self.filtered_df = self.original_df[years.isin(self.selected_years)]
        else:
            self.filtered_df = self.original_df.copy()

    def supplier_selection(self):
        """Render the supplier multiselect or, if unused, the top-% slider."""
        if self.filtered_df.empty:
            st.warning("No data available.")
            return

        suppliers = sorted(self.filtered_df['Naam'].dropna().unique())
        self.selected_suppliers = st.multiselect('Select suppliers:', suppliers)

        if not self.selected_suppliers:
            with st.expander("Advanced filter (top % of suppliers)", expanded=False):
                self.top_percent = st.slider(
                    label="Top % suppliers (only active if no supplier is manually selected):",
                    min_value=1,
                    max_value=100,
                    value=self.DEFAULT_TOP_PERCENT,
                    format="%d%%",
                    label_visibility="collapsed"
                )
            st.caption(f"No supplier selected. Filter shows top {self.top_percent}% suppliers sorted by relevance.")
        else:
            self.filtered_df = self.filtered_df[self.filtered_df['Naam'].isin(self.selected_suppliers)]
            # None tells every chart to skip the top-% truncation.
            self.top_percent = None
            st.caption(f"{len(self.selected_suppliers)} supplier(s) selected. Top-% filter is deactivated.")

    # ------------------------------------------------------------------
    # Page layout
    # ------------------------------------------------------------------

    def show_date_analysis(self):
        """Render the headline metrics and the four analysis tabs."""
        if self.filtered_df.empty:
            st.warning("No data available after filtering.")
            return

        year_label = ", ".join(map(str, self.selected_years)) if self.selected_years else "all years"
        st.subheader(f"Delivery Analysis for {year_label}")

        data = self.filtered_df
        total_orders = data['OrNu'].nunique() if 'OrNu' in data.columns else 0
        total_order_lines = len(data)
        total_suppliers = data['Naam'].nunique()
        fully_delivered = int(data['FullyDelivered'].eq(True).sum())

        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Total Orders", total_orders)
        col2.metric("Total Order Lines", total_order_lines)
        col3.metric("Total Suppliers", total_suppliers)
        col4.metric("Fully Delivered Lines", fully_delivered)

        tab_groups = st.tabs([
            "Per Order",
            "Per Order Line",
            "Timeliness & Trends",
            "Responsibility"
        ])

        with tab_groups[0]:
            self.plot_order_delivery_summary()
        with tab_groups[1]:
            self.plot_orderline_delivery_summary()
            self.plot_delivery_counts()
            self.plot_missing_delivery_date()
            self.plot_fully_delivered()
        with tab_groups[2]:
            self.plot_performance_over_time()
        with tab_groups[3]:
            self.plot_orderline_delivery_by_responsible()

    # ------------------------------------------------------------------
    # Charts
    # ------------------------------------------------------------------

    def _plot_timeliness(self, pivot_df, title, value_label):
        """Draw the stacked Early / On Time / Late bar chart for *pivot_df*."""
        if pivot_df.empty:
            st.warning("No usable data for analysis.")
            return

        fig = px.bar(
            pivot_df,
            x='Naam',
            y=self.DELIVERY_CATEGORIES,
            title=title,
            labels={'value': value_label, 'variable': 'Category'},
            hover_name='Naam'
        )
        fig.update_layout(barmode='stack', xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    def plot_order_delivery_summary(self):
        """Chart timeliness per supplier at order level.

        Each order is reduced to its latest expected and latest actual
        delivery date. Orders without either date are left out instead of
        being counted as late.
        """
        st.info("Shows how many full orders were delivered early, on time, or late per supplier. An order consists of multiple lines.")
        st.caption("More on-time and early deliveries is better.")

        df = self.filtered_df
        if 'OrNu' not in df.columns:
            st.warning("Order number (OrNu) not found in data.")
            return

        orders = df.groupby('OrNu').agg({
            'ExpectedDeliveryDate': 'max',
            'DeliveryDate': 'max',
            'Naam': 'first'
        }).reset_index()

        self._plot_timeliness(
            self._timeliness_pivot(orders),
            title="Order-level Delivery Timeliness per Supplier",
            value_label='Number of Orders'
        )

    def plot_orderline_delivery_summary(self):
        """Chart timeliness per supplier at order-line level."""
        st.info("Shows how many order lines were delivered early, on time, or late per supplier.")
        st.caption("More on-time and early deliveries is better.")

        self._plot_timeliness(
            self._timeliness_pivot(self.filtered_df),
            title="Order Line-level Delivery Timeliness per Supplier",
            value_label='Number of Order Lines'
        )

    def plot_delivery_counts(self):
        """Chart the summed ``DeliveryCount`` per supplier."""
        st.info("Shows the total number of delivery moments per supplier, measured at the line level.")
        st.caption("More deliveries is better.")

        grouped = self.filtered_df.groupby('Naam')['DeliveryCount'].sum().reset_index()
        if grouped.empty:
            st.info("No deliveries registered.")
            return

        grouped = self._keep_top(grouped.sort_values(by='DeliveryCount', ascending=False))

        fig = px.bar(grouped, x='Naam', y='DeliveryCount',
                     title="Total Deliveries per Supplier (Order Line Level)",
                     hover_data=['Naam', 'DeliveryCount'],
                     color_discrete_sequence=['orange'])
        fig.update_layout(xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    def plot_missing_delivery_date(self):
        """Chart the number of order lines lacking a delivery date per supplier."""
        st.info("Indicates how many order lines per supplier do not have a delivery date yet.")
        st.caption("Lower is better.")

        undelivered = self.filtered_df[self.filtered_df['DeliveryDate'].isna()]
        if undelivered.empty:
            st.info("All order lines are delivered.")
            return

        counts = self._supplier_counts(undelivered)

        fig = px.bar(counts, x='Supplier', y='Count',
                     title="Order Lines without Actual Delivery Date",
                     hover_data=['Supplier', 'Count'],
                     labels={'Count': 'Number of Order Lines'},
                     height=400)
        fig.update_layout(xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    def plot_fully_delivered(self):
        """Chart the number of fully delivered order lines per supplier."""
        st.info("Shows per supplier the number of order lines that were fully delivered.")
        st.caption("More is better.")

        delivered = self.filtered_df[self.filtered_df['FullyDelivered'].eq(True)]
        if delivered.empty:
            st.info("No fully delivered order lines found.")
            return

        counts = self._supplier_counts(delivered)

        fig = px.bar(counts, x='Supplier', y='Count',
                     title="Fully Delivered Order Lines",
                     hover_data=['Supplier', 'Count'],
                     color_discrete_sequence=['lightgreen'])
        fig.update_layout(xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    def plot_performance_over_time(self):
        """Chart the monthly summed ``DeliveryCount`` per supplier."""
        st.info("Visualizes the monthly frequency of deliveries per supplier.")
        st.caption("More deliveries per month is better.")

        # Rows without a date cannot be placed on the time axis; keeping them
        # would add a bogus "NaT" month to the chart.
        df = self.filtered_df.dropna(subset=['Datum']).copy()
        df['YearMonth'] = df['Datum'].dt.to_period('M').astype(str)
        timeseries = df.groupby(['YearMonth', 'Naam'])['DeliveryCount'].sum().reset_index()
        if timeseries.empty:
            st.info("No time-based delivery data available.")
            return

        if self.top_percent is not None:
            supplier_totals = timeseries.groupby('Naam')['DeliveryCount'].sum()
            top_suppliers = self._keep_top(supplier_totals.sort_values(ascending=False)).index
            timeseries = timeseries[timeseries['Naam'].isin(top_suppliers)]

        fig = px.line(timeseries, x='YearMonth', y='DeliveryCount', color='Naam',
                      title="Monthly Delivery Frequency",
                      hover_data=['Naam', 'DeliveryCount'],
                      markers=True)
        fig.update_layout(xaxis_tickangle=-45)
        st.plotly_chart(fig, use_container_width=True)

    def plot_orderline_delivery_by_responsible(self):
        """Compare timeliness across the five most frequent responsible persons.

        Shows the observed contingency table, a chi-square test with
        Cramér's V, and a stacked bar chart of the relative distribution.
        """
        st.info("Shows how many order lines were delivered early, on time, or late per responsible person.")
        st.caption("Analysis is based on order line level. Only top 5 responsible persons are included in chi-square test.")

        df = self.filtered_df.dropna(
            subset=['ExpectedDeliveryDate', 'DeliveryDate', 'Verantwoordelijke']
        )
        if df.empty:
            st.info("No usable data for analysis.")
            return

        top5 = df['Verantwoordelijke'].value_counts().nlargest(5).index
        # A boolean mask (rather than an 'Other' sentinel) keeps a person
        # whose name happens to be "Other" in the analysis.
        df_top5 = df[df['Verantwoordelijke'].isin(top5)]
        if df_top5.empty:
            st.info("No data available for top 5 responsible persons.")
            return

        delay_days = (df_top5['DeliveryDate'] - df_top5['ExpectedDeliveryDate']).dt.days
        observed = pd.crosstab(
            df_top5['Verantwoordelijke'].rename('VerantwoordelijkeTop5'),
            delay_days.apply(self._delay_category).rename('Category')
        )
        chi2_stat, p_val, cramers_v = self._association_stats(observed)

        st.markdown("#### Actual frequencies per responsible person")
        st.dataframe(observed, use_container_width=True)

        col1, col2, col3 = st.columns(3)
        col1.metric("Chi²", f"{chi2_stat:.2f}")
        col2.metric("p-value", f"{p_val:.4f}")
        # Undefined with a single person or a single category.
        col3.metric("Cramér's V", "n/a" if cramers_v is None else f"{cramers_v:.3f}")

        if p_val < 0.05:
            st.success("There is a statistically significant association (p < 0.05).")
        else:
            st.info("No statistically significant association (p ≥ 0.05).")

        relative = observed.div(observed.sum(axis=1), axis=0) * 100
        df_plot = relative.reset_index().melt(
            id_vars='VerantwoordelijkeTop5',
            var_name='Category',
            value_name='Percentage'
        )

        fig = px.bar(
            df_plot,
            x='VerantwoordelijkeTop5',
            y='Percentage',
            color='Category',
            title="Relative distribution of delivery categories per responsible person (Order Line Level)",
            labels={'Percentage': '% of lines', 'VerantwoordelijkeTop5': 'Responsible'}
        )
        fig.update_layout(
            barmode='stack',
            yaxis=dict(title="% of lines", ticksuffix='%'),
            xaxis_tickangle=-45,
            height=400
        )
        st.plotly_chart(fig, use_container_width=True)