
Commit 62dc13a

Merge branch 'main' into new-Q-159
2 parents: c403e5d + cef58f9

File tree

8 files changed (+57, -27 lines)
Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
Implement the epsilon-greedy method for action selection in an n-armed bandit problem. Given a set of estimated action values (Q-values), select an action using the epsilon-greedy policy: with probability epsilon, choose a random action; with probability 1 - epsilon, choose the action with the highest estimated value.
Lines changed: 5 additions & 0 deletions

@@ -0,0 +1,5 @@
{
    "input": "Q = np.array([0.5, 2.3, 1.7])\nepsilon = 0.0\naction = epsilon_greedy(Q, epsilon)\nprint(action)",
    "output": "1",
    "reasoning": "With epsilon=0.0 (always greedy), the highest Q-value is 2.3 at index 1, so the function always returns 1."
}
Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
### Epsilon-Greedy Policy

The epsilon-greedy method is a fundamental action selection strategy used in reinforcement learning, especially for solving the n-armed bandit problem. The key idea is to balance **exploration** (trying new actions) and **exploitation** (choosing the best-known action):

- With probability $\varepsilon$ (epsilon), the agent explores by selecting an action at random.
- With probability $1-\varepsilon$, it exploits by choosing the action with the highest estimated value (greedy choice).

The epsilon-greedy policy is simple to implement and provides a way to avoid getting stuck with suboptimal actions due to insufficient exploration.
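For intuition, if the exploratory draw is uniform over all $n$ actions (including the greedy one, as in the reference solution below), then with $n = 4$ and $\varepsilon = 0.2$ the greedy action is selected with probability $1 - \varepsilon + \varepsilon/n = 0.85$, while each other action is selected with probability $\varepsilon/n = 0.05$.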
Lines changed: 10 additions & 0 deletions

@@ -0,0 +1,10 @@
{
    "id": "158",
    "title": "Epsilon-Greedy Action Selection for n-Armed Bandit",
    "difficulty": "easy",
    "category": "Reinforcement Learning",
    "video": "",
    "likes": "0",
    "dislikes": "0",
    "contributor": []
}
Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
import numpy as np

def epsilon_greedy(Q, epsilon=0.1):
    if np.random.rand() < epsilon:
        return np.random.randint(len(Q))
    else:
        return int(np.argmax(Q))
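As a quick sanity check on the function above, here is a minimal usage sketch that estimates empirical selection frequencies; the action values, epsilon = 0.2, seed, and draw count are illustrative assumptions, not part of the committed files.

import numpy as np

def epsilon_greedy(Q, epsilon=0.1):
    # Mirrors the solution above: explore with probability epsilon,
    # otherwise pick the action with the highest estimated value.
    if np.random.rand() < epsilon:
        return np.random.randint(len(Q))
    return int(np.argmax(Q))

# Illustrative action values, epsilon, seed, and draw count (assumptions).
np.random.seed(0)
Q = np.array([0.5, 2.3, 1.7, 1.0])
counts = np.zeros(len(Q), dtype=int)
for _ in range(10_000):
    counts[epsilon_greedy(Q, epsilon=0.2)] += 1

# The greedy action (index 1) should land near 1 - 0.2 + 0.2/4 = 0.85,
# every other action near 0.2/4 = 0.05.
print(counts / counts.sum())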
Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
import numpy as np

def epsilon_greedy(Q, epsilon=0.1):
    """
    Selects an action using the epsilon-greedy policy.
    Q: np.ndarray of shape (n,) -- estimated action values
    epsilon: float in [0, 1]
    Returns: int, selected action index
    """
    # Your code here
    pass
Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
[
    {
        "test": "import numpy as np\nnp.random.seed(0)\nprint([epsilon_greedy(np.array([1, 2, 3]), epsilon=0.0) for _ in range(5)])",
        "expected_output": "[2, 2, 2, 2, 2]"
    },
    {
        "test": "import numpy as np\nnp.random.seed(1)\nprint([epsilon_greedy(np.array([5, 2, 1]), epsilon=1.0) for _ in range(5)])",
        "expected_output": "[0, 1, 1, 0, 0]"
    },
    {
        "test": "import numpy as np\nnp.random.seed(42)\nresults = [epsilon_greedy(np.array([1.5, 2.5, 0.5]), epsilon=0.5) for _ in range(10)]\nprint(results)",
        "expected_output": "[1, 0, 1, 1, 1, 0, 1, 0, 0, 0]"
    }
]

utils/convert_single_question.py

Lines changed: 1 addition & 27 deletions
@@ -29,33 +29,7 @@
 # ── 1️⃣ EDIT YOUR QUESTION HERE ────────────────────────────────────────────
 QUESTION_DICT: Dict[str, Any] = {
     "id":'158',
-    "title": "Incremental Mean for Online Reward Estimation",
-    "description": "Implement an efficient method to update the mean reward for a k-armed bandit action after receiving each new reward, **without storing the full history of rewards**. Given the previous mean estimate (Q_prev), the number of times the action has been selected (k), and a new reward (R), compute the updated mean using the incremental formula.\n\n**Note:** Using a regular mean that stores all past rewards will eventually run out of memory. Your solution should use only the previous mean, the count, and the new reward.",
-    "category": "Reinforcement Learning",
-    "difficulty": "easy",
-    "starter_code": "def incremental_mean(Q_prev, k, R):\n \"\"\"\n Q_prev: previous mean estimate (float)\n k: number of times the action has been selected (int)\n R: new observed reward (float)\n Returns: new mean estimate (float)\n \"\"\"\n # Your code here\n pass\n",
-    "solution": "def incremental_mean(Q_prev, k, R):\n return Q_prev + (1 / k) * (R - Q_prev)",
-    "test_cases": [
-        {
-            "test": "Q = 0.0\nk = 1\nR = 5.0\nprint(round(incremental_mean(Q, k, R), 4))",
-            "expected_output": "5.0"
-        },
-        {
-            "test": "Q = 5.0\nk = 2\nR = 7.0\nprint(round(incremental_mean(Q, k, R), 4))",
-            "expected_output": "6.0"
-        },
-        {
-            "test": "Q = 6.0\nk = 3\nR = 4.0\nprint(round(incremental_mean(Q, k, R), 4))",
-            "expected_output": "5.3333"
-        }
-    ],
-    "example": {
-        "input": "Q_prev = 2.0\nk = 2\nR = 6.0\nnew_Q = incremental_mean(Q_prev, k, R)\nprint(round(new_Q, 2))",
-        "output": "4.0",
-        "reasoning": "The updated mean is Q_prev + (1/k) * (R - Q_prev) = 2.0 + (1/2)*(6.0 - 2.0) = 2.0 + 2.0 = 4.0"
-    },
-    "learn_section": "### Incremental Mean Update Rule\n\nThe incremental mean formula lets you update your estimate of the mean after each new observation, **without keeping all previous rewards in memory**. For the k-th reward $R_k$ and previous estimate $Q_{k}$:\n\n$$\nQ_{k+1} = Q_k + \\frac{1}{k} (R_k - Q_k)\n$$\n\nThis saves memory compared to the regular mean, which requires storing all past rewards and recalculating each time. The incremental rule is crucial for online learning and large-scale problems where storing all data is impractical."
-}
+