#!/usr/bin/env python
"""
convert_single_question.py
──────────────────────────
Paste ONE question dict into QUESTION_DICT and run:

    python utils/convert_single_question.py

The script splits it into:

questions/<id>_<slug>/
    ├─ meta.json
    ├─ description.md
    ├─ learn.md
    ├─ starter_code.py
    ├─ solution.py
    ├─ example.json
    ├─ tests.json
    ├─ tinygrad/  (optional)
    └─ pytorch/   (optional)
"""

import base64
import json
import pathlib
import re
from typing import Any, Dict

# ── 1️⃣ EDIT YOUR QUESTION HERE ────────────────────────────────────────────
QUESTION_DICT: Dict[str, Any] = {
    "id": "140",
    "description": "Write a Python class to implement the Bernoulli Naive Bayes classifier for binary (0/1) feature data. Your class should have two methods: `forward(self, X, y)` to train on the input data (X: 2D NumPy array of binary features, y: 1D NumPy array of class labels) and `predict(self, X)` to output predicted labels for a 2D test matrix X. Use Laplace smoothing (parameter: smoothing=1.0). Return predictions as a NumPy array. Only use NumPy. Predictions must be binary (0 or 1) and you must handle cases where the training data contains only one class. All log/likelihood calculations should use log probabilities for numerical stability.",
    "test_cases": [
        {
            "test": "import numpy as np\nmodel = NaiveBayes(smoothing=1.0)\nX = np.array([[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0], [1, 1, 1]])\ny = np.array([1, 1, 0, 0, 1])\nmodel.forward(X, y)\nprint(model.predict(np.array([[1, 0, 1]])))",
            "expected_output": "[1]"
        },
        {
            "test": "import numpy as np\nmodel = NaiveBayes(smoothing=1.0)\nX = np.array([[0], [1], [0], [1]])\ny = np.array([0, 1, 0, 1])\nmodel.forward(X, y)\nprint(model.predict(np.array([[0], [1]])))",
            "expected_output": "[0 1]"
        },
        {
            "test": "import numpy as np\nmodel = NaiveBayes(smoothing=1.0)\nX = np.array([[0, 0], [1, 0], [0, 1]])\ny = np.array([0, 1, 0])\nmodel.forward(X, y)\nprint(model.predict(np.array([[1, 1]])))",
            "expected_output": "[0]"
        },
        {
            "test": "import numpy as np\nnp.random.seed(42)\nmodel = NaiveBayes(smoothing=1.0)\nX = np.random.randint(0, 2, (100, 5))\ny = np.random.choice([0, 1], size=100)\nmodel.forward(X, y)\nX_test = np.random.randint(0, 2, (10, 5))\npred = model.predict(X_test)\nprint(pred.shape)",
            "expected_output": "(10,)"
        },
        {
            "test": "import numpy as np\nnp.random.seed(0)\nmodel = NaiveBayes(smoothing=1.0)\nX = np.random.randint(0, 2, (10, 3))\ny = np.zeros(10, dtype=int)\nmodel.forward(X, y)\nX_test = np.random.randint(0, 2, (3, 3))\nprint(model.predict(X_test))",
            "expected_output": "[0 0 0]"
        }
    ],
    "solution": "import numpy as np\n\nclass NaiveBayes():\n    def __init__(self, smoothing=1.0):\n        self.smoothing = smoothing\n        self.classes = None\n        self.priors = None\n        self.likelihoods = None\n\n    def forward(self, X, y):\n        self.classes, class_counts = np.unique(y, return_counts=True)\n        self.priors = {cls: np.log(class_counts[i] / len(y)) for i, cls in enumerate(self.classes)}\n        self.likelihoods = {}\n        for cls in self.classes:\n            X_cls = X[y == cls]\n            prob = (np.sum(X_cls, axis=0) + self.smoothing) / (X_cls.shape[0] + 2 * self.smoothing)\n            self.likelihoods[cls] = (np.log(prob), np.log(1 - prob))\n\n    def _compute_posterior(self, sample):\n        posteriors = {}\n        for cls in self.classes:\n            posterior = self.priors[cls]\n            prob_1, prob_0 = self.likelihoods[cls]\n            likelihood = np.sum(sample * prob_1 + (1 - sample) * prob_0)\n            posterior += likelihood\n            posteriors[cls] = posterior\n        return max(posteriors, key=posteriors.get)\n\n    def predict(self, X):\n        return np.array([self._compute_posterior(sample) for sample in X])",
    "example": {
        "input": "X = np.array([[1, 0, 1], [1, 1, 0], [0, 0, 1], [0, 1, 0], [1, 1, 1]]); y = np.array([1, 1, 0, 0, 1])\nmodel = NaiveBayes(smoothing=1.0)\nmodel.forward(X, y)\nprint(model.predict(np.array([[1, 0, 1]])))",
        "output": "[1]",
        "reasoning": "The model learns class priors and feature probabilities with Laplace smoothing. For [1, 0, 1], the posterior for class 1 is higher, so the model predicts 1."
    },
    "category": "Machine Learning",
    "starter_code": "import numpy as np\n\nclass NaiveBayes():\n    def __init__(self, smoothing=1.0):\n        # Initialize smoothing\n        pass\n\n    def forward(self, X, y):\n        # Fit model to binary features X and labels y\n        pass\n\n    def predict(self, X):\n        # Predict class labels for test set X\n        pass",
    "title": "Bernoulli Naive Bayes Classifier",
    "learn_section": r"""# **Naive Bayes Classifier**

## **1. Definition**

Naive Bayes is a **probabilistic machine learning algorithm** used for **classification tasks**. It is based on **Bayes' Theorem**, which describes the probability of an event based on prior knowledge of related events.

The algorithm:
- assumes that **features are conditionally independent** given the class label (the "naive" assumption);
- calculates the posterior probability for each class and assigns the class with the **highest posterior** to the sample.

---

## **2. Bayes' Theorem**

Bayes' Theorem is given by:

$$
P(C | X) = \frac{P(X | C) \times P(C)}{P(X)}
$$

Where:
- $P(C | X)$ → **Posterior** probability: the probability of class $C$ given the feature vector $X$
- $P(X | C)$ → **Likelihood**: the probability of the data $X$ given the class $C$
- $P(C)$ → **Prior** probability: the initial probability of class $C$ before observing any data
- $P(X)$ → **Evidence**: the total probability of the data across all classes (acts as a normalizing constant)

Since $P(X)$ is the same for all classes during comparison, it can be ignored, simplifying the formula to:

$$
P(C | X) \propto P(X | C) \times P(C)
$$
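
In practice, the comparison is done with **log probabilities**, $\log P(C) + \log P(X | C)$: summing logs avoids the floating-point underflow that multiplying many small probabilities would cause.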

---

## **3. Bernoulli Naive Bayes**

- Used for **binary data** (features take only 0 or 1 values).
- The likelihood is given by:

$$
P(X | C) = \prod_{i=1}^{n} P(x_i | C)^{x_i} \cdot (1 - P(x_i | C))^{1 - x_i}
$$
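
For example, with two features where $P(x_1 | C) = 0.8$ and $P(x_2 | C) = 0.3$, the sample $X = (1, 0)$ has likelihood $P(X | C) = 0.8 \times (1 - 0.3) = 0.56$.

With **Laplace smoothing** (parameter $\alpha$, the `smoothing` argument here), each per-feature probability is estimated as

$$
P(x_i = 1 | C) = \frac{N_{i,C} + \alpha}{N_C + 2\alpha}
$$

where $N_{i,C}$ is the number of class-$C$ training samples with $x_i = 1$ and $N_C$ is the total number of class-$C$ samples, so no estimated probability is ever exactly 0 or 1.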

---

## **4. Applications of Naive Bayes**

- **Text Classification:** Spam detection, sentiment analysis, and news categorization.
- **Document Categorization:** Sorting documents by topic.
- **Fraud Detection:** Identifying fraudulent transactions or behaviors.
- **Recommender Systems:** Classifying users into preference groups.

---""",
    "contributor": [
        {
            "profile_link": "https://github.com/moe18",
            "name": "Moe Chabot"
        }
    ],
    "likes": "0",
    "dislikes": "0",
    "difficulty": "medium",
    "video": ""
}

# ────────────────────────────────────────────────────────────────────────────


# ---------- helpers ---------------------------------------------------------
def slugify(text: str) -> str:
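    # e.g. slugify("Bernoulli Naive Bayes Classifier")
    #   -> "bernoulli-naive-bayes-classifier"  (lowercased, hyphen-joined, max 50 chars)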
    text = re.sub(r"[^0-9A-Za-z]+", "-", text.lower())
    return re.sub(r"-{2,}", "-", text).strip("-")[:50]


def maybe_b64(s: str) -> str:
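    # Best-effort decode of a base64-looking payload, e.g. "aGVsbG8=" -> "hello";
    # anything that fails the length/charset check or UTF-8 decoding is
    # returned unchanged.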
    try:
        if len(s) % 4 == 0 and re.fullmatch(r"[0-9A-Za-z+/=\n\r]+", s):
            return base64.b64decode(s).decode("utf-8")
    except Exception:
        pass
    return s


def write_text(path: pathlib.Path, content: str) -> None:
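    # Create parent directories as needed and normalize to one trailing newline.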
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text(content.rstrip("\n") + "\n", encoding="utf-8")


def write_json(path: pathlib.Path, obj: Any) -> None:
    write_text(path, json.dumps(obj, indent=2, ensure_ascii=False))


# ---------- converter -------------------------------------------------------
def convert_one(q: Dict[str, Any]) -> None:
    folder = pathlib.Path("questions") / f"{q['id']}_{slugify(q['title'])}"
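    # e.g. questions/140_bernoulli-naive-bayes-classifier for the dict above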
    folder.mkdir(parents=True, exist_ok=True)

    # meta.json
    meta = {
        "id": q["id"],
        "title": q["title"],
        "difficulty": q["difficulty"],
        "category": q["category"],
        "video": q.get("video", ""),
        "likes": q.get("likes", "0"),
        "dislikes": q.get("dislikes", "0"),
        "contributor": q.get("contributor", []),
    }
    for opt in ("tinygrad_difficulty", "pytorch_difficulty", "marimo_link"):
        if opt in q:
            meta[opt] = q[opt]
    write_json(folder / "meta.json", meta)

    # core files
    write_text(folder / "description.md", q["description"])
    write_text(folder / "learn.md", q["learn_section"])
    write_text(folder / "starter_code.py", q["starter_code"])
    write_text(folder / "solution.py", q["solution"])
    write_json(folder / "example.json", q["example"])
    write_json(folder / "tests.json", q["test_cases"])

    # optional language-specific extras
    for lang in ("tinygrad", "pytorch"):
        sc, so, tc = (f"{lang}_starter_code", f"{lang}_solution", f"{lang}_test_cases")
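        # Starter code and solutions may arrive base64-encoded, so try to decode.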
        if any(k in q for k in (sc, so, tc)):
            sub = folder / lang
            if sc in q:
                write_text(sub / "starter_code.py", maybe_b64(q[sc]))
            if so in q:
                write_text(sub / "solution.py", maybe_b64(q[so]))
            if tc in q:
                write_json(sub / "tests.json", q[tc])

    # success message (relative if possible)
    try:
        rel = folder.relative_to(pathlib.Path.cwd())
    except ValueError:
        rel = folder
    print(f"✓ Created {rel}")


# ---------- main ------------------------------------------------------------
def main() -> None:
    convert_one(QUESTION_DICT)


if __name__ == "__main__":
    main()