Commit 7998b03

Merge pull request #550 from Jeet009/mlOps

Build a Simple ETL Pipeline (MLOps)

2 parents 99cf136 + 5c8919f commit 7998b03

File tree

9 files changed: +197 -0 lines changed

build/184.json

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
{
  "id": "184",
  "title": "Build a Simple ETL Pipeline (MLOps)",
  "difficulty": "medium",
  "category": "MLOps",
  "video": "",
  "likes": "0",
  "dislikes": "0",
  "contributor": [
    {
      "profile_link": "https://github.com/Jeet009",
      "name": "Jeet Mukherjee"
    }
  ],
  "description": "## Problem\n\nImplement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.\n\nGiven a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:\n\n1. Extracts rows from the raw CSV text.\n2. Transforms data by:\n\t- Filtering only rows where `event_type == \"purchase\"`.\n\t- Converting `value` to float and dropping invalid rows.\n\t- Aggregating total purchase `value` per `user_id`.\n3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.\n\nAssume small inputs (no external libs), handle extra whitespace, and ignore blank lines.",
  "learn_section": "## Solution Explanation\n\nThis task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.\n\n### ETL breakdown\n- Extract: parse raw CSV text, ignore blanks, and split into header and rows.\n- Transform:\n\t- Filter only relevant records (event_type == \"purchase\").\n\t- Cast `value` to float; discard invalid rows to maintain data quality.\n\t- Aggregate total purchase value per user to create compact features.\n- Load: return a deterministic, sorted list of `(user_id, total_value)`.\n\n### Why this design?\n- Input sanitation prevents runtime errors and poor-quality features.\n- Aggregation compresses event-level logs into user-level features commonly used in models.\n- Sorting produces stable, testable outputs.\n\n### Complexity\n- For N rows, parsing and aggregation run in O(N); sorting the U unique users costs O(U log U).\n\n### Extensions\n- Add schema validation and logging.\n- Write outputs to files or databases.\n- Schedule ETL runs and add monitoring for drift and freshness.",
  "starter_code": "# Implement your function below.\n\ndef run_etl(csv_text: str) -> list[tuple[str, float]]:\n\t\"\"\"Run a simple ETL pipeline over CSV text with header user_id,event_type,value.\n\n\tReturns a sorted list of (user_id, total_value) for event_type == \"purchase\".\n\t\"\"\"\n\t# TODO: implement extract, transform, and load steps\n\traise NotImplementedError",
  "solution": "from typing import List, Tuple\n\n\ndef run_etl(csv_text: str) -> List[Tuple[str, float]]:\n\t\"\"\"Reference ETL implementation.\n\n\t- Extract: parse CSV text, skip header, strip whitespace, ignore blanks\n\t- Transform: keep event_type == \"purchase\"; parse value as float; aggregate per user\n\t- Load: return sorted list of (user_id, total_value) by user_id asc\n\t\"\"\"\n\tlines = [line.strip() for line in csv_text.splitlines() if line.strip()]\n\tif not lines:\n\t\treturn []\n\t# header\n\theader = lines[0]\n\trows = lines[1:]\n\n\t# indices from header (allow varying order and case)\n\theaders = [h.strip().lower() for h in header.split(\",\")]\n\ttry:\n\t\tidx_user = headers.index(\"user_id\")\n\t\tidx_event = headers.index(\"event_type\")\n\t\tidx_value = headers.index(\"value\")\n\texcept ValueError:\n\t\t# header missing required columns\n\t\treturn []\n\n\taggregates: dict[str, float] = {}\n\tfor row in rows:\n\t\tparts = [c.strip() for c in row.split(\",\")]\n\t\tif len(parts) <= max(idx_user, idx_event, idx_value):\n\t\t\tcontinue\n\t\tuser_id = parts[idx_user]\n\t\tevent_type = parts[idx_event].lower()\n\t\tif event_type != \"purchase\":\n\t\t\tcontinue\n\t\ttry:\n\t\t\tvalue = float(parts[idx_value])\n\t\texcept ValueError:\n\t\t\tcontinue\n\t\taggregates[user_id] = aggregates.get(user_id, 0.0) + value\n\n\treturn sorted(aggregates.items(), key=lambda kv: kv[0])",
  "example": {
    "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")",
    "output": "[('u1', 15.0), ('u2', 3.5)]",
    "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id."
  },
  "test_cases": [
    {
      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n'))",
      "expected_output": "[('u1', 15.0), ('u2', 3.5)]"
    },
    {
      "test": "from solution import run_etl; print(run_etl('user_id,event_type,value\\n'))",
      "expected_output": "[]"
    },
    {
      "test": "from solution import run_etl; print(run_etl('value,event_type,user_id\\n 1.0, purchase, u1\\n 2.0, purchase, u1\\n'))",
      "expected_output": "[('u1', 3.0)]"
    }
  ]
}

build/185.json

Lines changed: 38 additions & 0 deletions

@@ -0,0 +1,38 @@
{
  "id": "185",
  "title": "Basic Data Drift Check: Mean and Variance Thresholds",
  "difficulty": "easy",
  "category": "MLOps",
  "video": "",
  "likes": "0",
  "dislikes": "0",
  "contributor": [
    {
      "profile_link": "https://github.com/Jeet009",
      "name": "Jeet Mukherjee"
    }
  ],
  "description": "## Problem\n\nImplement a basic data drift check comparing two numeric datasets (reference vs. current).\n\nWrite a function `check_drift(ref, cur, mean_threshold, var_threshold)` that:\n\n- Accepts two lists of numbers `ref` and `cur`.\n- Computes the absolute difference in means and variances.\n- Returns a tuple `(mean_drift, var_drift)` where each element is a boolean indicating whether drift exceeds the corresponding threshold:\n\t- `mean_drift = abs(mean(ref) - mean(cur)) > mean_threshold`\n\t- `var_drift = abs(var(ref) - var(cur)) > var_threshold`\n\nAssume population variance (divide by N). Handle empty inputs by returning `(False, False)`.",
  "learn_section": "## Solution Explanation\n\nWe compare two numeric samples (reference vs. current) using mean and variance with user-defined thresholds.\n\n### Definitions\n- Mean: \\( \\mu = \\frac{1}{N}\\sum_i x_i \\)\n- Population variance: \\( \\sigma^2 = \\frac{1}{N}\\sum_i (x_i - \\mu)^2 \\)\n\n### Drift rules\n- Mean drift if \\(|\\mu_{ref} - \\mu_{cur}| > \\tau_{mean}\\)\n- Variance drift if \\(|\\sigma^2_{ref} - \\sigma^2_{cur}| > \\tau_{var}\\)\n\n### Edge cases\n- If either sample is empty, return `(False, False)` to avoid false alarms.\n- Population vs. sample variance: we use population here to match many monitoring setups. Either is fine if used consistently.\n\n### Complexity\n- O(N + M) to compute stats; O(1) extra space.",
  "starter_code": "from typing import List, Tuple\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\t\"\"\"Return (mean_drift, var_drift) comparing ref vs cur with given thresholds.\n\n\tUse population variance.\n\t\"\"\"\n\t# TODO: handle empty inputs; compute means and variances; compare with thresholds\n\traise NotImplementedError",
  "solution": "from typing import List, Tuple\n\n\ndef _mean(xs: List[float]) -> float:\n\treturn sum(xs) / len(xs) if xs else 0.0\n\n\ndef _var(xs: List[float]) -> float:\n\tif not xs:\n\t\treturn 0.0\n\tm = _mean(xs)\n\treturn sum((x - m) * (x - m) for x in xs) / len(xs)\n\n\ndef check_drift(ref: List[float], cur: List[float], mean_threshold: float, var_threshold: float) -> Tuple[bool, bool]:\n\tif not ref or not cur:\n\t\treturn (False, False)\n\tmean_ref = _mean(ref)\n\tmean_cur = _mean(cur)\n\tvar_ref = _var(ref)\n\tvar_cur = _var(cur)\n\tmean_drift = abs(mean_ref - mean_cur) > mean_threshold\n\tvar_drift = abs(var_ref - var_cur) > var_threshold\n\treturn (mean_drift, var_drift)",
  "example": {
    "input": "check_drift([1, 2, 3], [1.1, 2.2, 3.3], 0.05, 0.1)",
    "output": "(True, True)",
    "reasoning": "Mean(ref)=2.0, Mean(cur)=2.2 → |Δ|=0.2>0.05. Var(ref)=2/3≈0.667; Var(cur)=1.21×0.667≈0.807 → |Δ|≈0.14>0.1."
  },
  "test_cases": [
    {
      "test": "from solution import check_drift; print(check_drift([1,2,3], [1.1,2.2,3.3], 0.05, 0.1))",
      "expected_output": "(True, True)"
    },
    {
      "test": "from solution import check_drift; print(check_drift([0,0,0], [0,0,0], 0.01, 0.01))",
      "expected_output": "(False, False)"
    },
    {
      "test": "from solution import check_drift; print(check_drift([], [1,2,3], 0.01, 0.01))",
      "expected_output": "(False, False)"
    }
  ]
}
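
To make the example's arithmetic concrete, here is a minimal sketch (a reviewer-style check, not part of this diff) that recomputes the statistics from the example above in plain Python:

```python
# Reviewer-style check of the build/185.json example (not part of this diff).
ref = [1, 2, 3]
cur = [1.1, 2.2, 3.3]

mean_ref = sum(ref) / len(ref)  # 2.0
mean_cur = sum(cur) / len(cur)  # 2.2
var_ref = sum((x - mean_ref) ** 2 for x in ref) / len(ref)  # 2/3 ≈ 0.667
var_cur = sum((x - mean_cur) ** 2 for x in cur) / len(cur)  # ≈ 0.807

print(abs(mean_ref - mean_cur) > 0.05)  # True: |Δmean| = 0.2
print(abs(var_ref - var_cur) > 0.1)     # True: |Δvar| ≈ 0.14
```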
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
## Problem

Implement a simple ETL (Extract-Transform-Load) pipeline for model-ready data preparation.

Given a CSV-like string containing user events with columns: `user_id,event_type,value` (header included), write a function `run_etl(csv_text)` that:

1. Extracts rows from the raw CSV text.
2. Transforms data by:
   - Filtering only rows where `event_type == "purchase"`.
   - Converting `value` to float and dropping invalid rows.
   - Aggregating total purchase `value` per `user_id`.
3. Loads the transformed results by returning a list of `(user_id, total_value)` tuples sorted by `user_id` ascending.

Assume small inputs (no external libs), handle extra whitespace, and ignore blank lines.
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
{
  "input": "run_etl(\"user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n\\n\")",
  "output": "[('u1', 15.0), ('u2', 3.5)]",
  "reasoning": "Keep only purchases; convert values; drop invalid; aggregate per user; sort by user_id."
}
Lines changed: 24 additions & 0 deletions

@@ -0,0 +1,24 @@
## Solution Explanation

This task mirrors a minimal MLOps ETL flow that prepares data for downstream modeling.

### ETL breakdown
- Extract: parse raw CSV text, ignore blanks, and split into header and rows.
- Transform:
   - Filter only relevant records (event_type == "purchase").
   - Cast `value` to float; discard invalid rows to maintain data quality.
   - Aggregate total purchase value per user to create compact features (sketched below).
- Load: return a deterministic, sorted list of `(user_id, total_value)`.

### Why this design?
- Input sanitation prevents runtime errors and poor-quality features.
- Aggregation compresses event-level logs into user-level features commonly used in models.
- Sorting produces stable, testable outputs.

### Complexity
- For N rows, parsing and aggregation run in O(N); sorting the U unique users costs O(U log U).

### Extensions
- Add schema validation and logging.
- Write outputs to files or databases.
- Schedule ETL runs and add monitoring for drift and freshness.
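
As a complement to the breakdown above, here is a minimal sketch of the transform-and-aggregate core over already-parsed rows. The sample rows are illustrative, and the `collections.defaultdict` variant is an alternative to the `dict.get` accumulation used in the reference solution:

```python
# Illustrative sketch of the transform-and-aggregate step only
# (not the repository's reference solution).
from collections import defaultdict

# Hypothetical rows already extracted from CSV: (user_id, event_type, value)
rows = [("u1", "purchase", 10.0), ("u2", "view", 1.0), ("u1", "purchase", 5.0)]

totals: defaultdict[str, float] = defaultdict(float)
for user_id, event_type, value in rows:
    if event_type == "purchase":  # Transform: keep only relevant events
        totals[user_id] += value  # Transform: aggregate per user

print(sorted(totals.items()))     # Load: [('u1', 15.0)] in deterministic order
```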
Lines changed: 12 additions & 0 deletions

@@ -0,0 +1,12 @@
{
  "id": "187",
  "title": "Build a Simple ETL Pipeline (MLOps)",
  "difficulty": "medium",
  "category": "MLOps",
  "video": "",
  "likes": "0",
  "dislikes": "0",
  "contributor": [
    { "profile_link": "https://github.com/Jeet009", "name": "Jeet Mukherjee" }
  ]
}
Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
from typing import List, Tuple


def run_etl(csv_text: str) -> List[Tuple[str, float]]:
    """Reference ETL implementation.

    - Extract: parse CSV text, skip header, strip whitespace, ignore blanks
    - Transform: keep event_type == "purchase"; parse value as float; aggregate per user
    - Load: return sorted list of (user_id, total_value) by user_id asc
    """
    lines = [line.strip() for line in csv_text.splitlines() if line.strip()]
    if not lines:
        return []
    # header
    header = lines[0]
    rows = lines[1:]

    # indices from header (allow varying order and case)
    headers = [h.strip().lower() for h in header.split(",")]
    try:
        idx_user = headers.index("user_id")
        idx_event = headers.index("event_type")
        idx_value = headers.index("value")
    except ValueError:
        # header missing required columns
        return []

    aggregates: dict[str, float] = {}
    for row in rows:
        parts = [c.strip() for c in row.split(",")]
        if len(parts) <= max(idx_user, idx_event, idx_value):
            continue
        user_id = parts[idx_user]
        event_type = parts[idx_event].lower()
        if event_type != "purchase":
            continue
        try:
            value = float(parts[idx_value])
        except ValueError:
            continue
        aggregates[user_id] = aggregates.get(user_id, 0.0) + value

    return sorted(aggregates.items(), key=lambda kv: kv[0])
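
For quick verification, a short driver (not part of the diff) that feeds the reference solution the example input from build/184.json:

```python
# Manual check: run the reference solution on the documented example input.
csv_text = (
    "user_id,event_type,value\n"
    " u1, purchase, 10.0\n"
    " u2, view, 1.0\n"
    " u1, purchase, 5\n"
    " u3, purchase, not_a_number\n"
    " u2, purchase, 3.5 \n"
    "\n"
)
print(run_etl(csv_text))  # [('u1', 15.0), ('u2', 3.5)]
```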
Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
# Implement your function below.

def run_etl(csv_text: str) -> list[tuple[str, float]]:
    """Run a simple ETL pipeline over CSV text with header user_id,event_type,value.

    Returns a sorted list of (user_id, total_value) for event_type == "purchase".
    """
    # TODO: implement extract, transform, and load steps
    raise NotImplementedError
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
[
  {
    "test": "print(run_etl('user_id,event_type,value\\n u1, purchase, 10.0\\n u2, view, 1.0\\n u1, purchase, 5\\n u3, purchase, not_a_number\\n u2, purchase, 3.5 \\n'))",
    "expected_output": "[('u1', 15.0), ('u2', 3.5)]"
  },
  {
    "test": "print(run_etl('user_id,event_type,value'))",
    "expected_output": "[]"
  },
  {
    "test": "print(run_etl('value,event_type,user_id\\n 1.0, purchase, u1\\n 2.0, purchase, u1\\n'))",
    "expected_output": "[('u1', 3.0)]"
  }
]
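
The grading harness itself is not included in this diff; the sketch below shows one plausible way such `test`/`expected_output` pairs could be executed, assuming stdout is captured and compared after stripping whitespace. The `from solution import run_etl` line mirrors the convention used in build/184.json and assumes solution.py (above) is importable; the inlined case is hypothetical.

```python
# Plausible harness sketch (an assumption; the repository's actual grader
# is not shown in this diff). Each "test" string is executed as Python
# source and its stripped stdout is compared with "expected_output".
import contextlib
import io

from solution import run_etl  # assumption: solution.py is on the path


def run_case(test_src: str, expected: str, namespace: dict) -> bool:
    """Execute one test string and compare its stripped stdout to expected."""
    buf = io.StringIO()
    with contextlib.redirect_stdout(buf):
        exec(test_src, namespace)  # run the one-liner test source
    return buf.getvalue().strip() == expected


# Hypothetical inlined case in the same shape as the JSON entries above.
case = {
    "test": "print(run_etl('user_id,event_type,value\\n u1, purchase, 10.0\\n'))",
    "expected_output": "[('u1', 10.0)]",
}
print(run_case(case["test"], case["expected_output"], {"run_etl": run_etl}))  # True
```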
