
Commit 7291a94

Merge pull request #503 from Open-Deep-ML/new-Q-159
added new Q
2 parents cef58f9 + 62dc13a commit 7291a94

8 files changed: +53, -27 lines

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
+Implement an efficient method to update the mean reward for a k-armed bandit action after receiving each new reward, **without storing the full history of rewards**. Given the previous mean estimate (Q_prev), the number of times the action has been selected (k), and a new reward (R), compute the updated mean using the incremental formula.
+
+**Note:** A regular mean that stores all past rewards will eventually run out of memory. Your solution should use only the previous mean, the count, and the new reward.
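As an illustration (not part of the committed files), the short sketch below contrasts a regular mean that keeps the whole reward history with the incremental update this task asks for; the function name `incremental_mean` and the sample rewards are assumptions chosen to mirror the starter code and tests in this commit.

```python
# Illustrative sketch only; not part of this commit.
# A regular mean must store every reward, while the incremental update keeps
# only the running estimate and the count, yet produces the same value.

def incremental_mean(Q_prev, k, R):
    # Q_prev: estimate before this reward, k: selection count including it, R: new reward
    return Q_prev + (1 / k) * (R - Q_prev)

history = []   # what the regular mean has to keep (grows without bound)
Q = 0.0        # what the incremental estimate keeps (plus the count k)
for k, R in enumerate([5.0, 7.0, 4.0, 9.0], start=1):
    history.append(R)
    regular = sum(history) / len(history)   # O(k) memory
    Q = incremental_mean(Q, k, R)           # O(1) memory
    assert abs(regular - Q) < 1e-12         # the two estimates agree at every step

print(round(Q, 2))  # 6.25
```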
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+{
+  "input": "Q_prev = 2.0\nk = 2\nR = 6.0\nnew_Q = incremental_mean(Q_prev, k, R)\nprint(round(new_Q, 2))",
+  "output": "4.0",
+  "reasoning": "The updated mean is Q_prev + (1/k) * (R - Q_prev) = 2.0 + (1/2)*(6.0 - 2.0) = 2.0 + 2.0 = 4.0"
+}
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+### Incremental Mean Update Rule
+
+The incremental mean formula lets you update your estimate of the mean after each new observation, **without keeping all previous rewards in memory**. For the k-th reward $R_k$ and previous estimate $Q_k$:
+
+$$
+Q_{k+1} = Q_k + \frac{1}{k} (R_k - Q_k)
+$$
+
+This saves memory compared to the regular mean, which requires storing all past rewards and recalculating each time. The incremental rule is crucial for online learning and large-scale problems where storing all data is impractical.
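For completeness (this derivation is not part of the committed learn file), the update rule follows from rewriting the sample mean of the first $k$ rewards in terms of the previous estimate $Q_k = \frac{1}{k-1}\sum_{i=1}^{k-1} R_i$:

$$
Q_{k+1} = \frac{1}{k}\sum_{i=1}^{k} R_i = \frac{1}{k}\left(R_k + (k-1)\,Q_k\right) = Q_k + \frac{1}{k}\left(R_k - Q_k\right)
$$

For $k = 1$ the $(k-1)\,Q_k$ term vanishes, so $Q_2 = R_1$ regardless of the initial estimate, which matches the first test case ($Q = 0.0$, $k = 1$, $R = 5.0$ gives $5.0$).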
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+{
+  "id": "158",
+  "title": "Incremental Mean for Online Reward Estimation",
+  "difficulty": "easy",
+  "category": "Reinforcement Learning",
+  "video": "",
+  "likes": "0",
+  "dislikes": "0",
+  "contributor": []
+}
Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+def incremental_mean(Q_prev, k, R):
+    return Q_prev + (1 / k) * (R - Q_prev)
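As a quick, illustrative check (not part of the diff), calling the committed solution with the values from the example file reproduces the expected output:

```python
# Illustrative usage with the example values; not part of the commit.
def incremental_mean(Q_prev, k, R):
    return Q_prev + (1 / k) * (R - Q_prev)

Q_prev, k, R = 2.0, 2, 6.0
new_Q = incremental_mean(Q_prev, k, R)  # 2.0 + (1/2) * (6.0 - 2.0) = 4.0
print(round(new_Q, 2))                  # prints 4.0
```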
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+def incremental_mean(Q_prev, k, R):
+    """
+    Q_prev: previous mean estimate (float)
+    k: number of times the action has been selected (int)
+    R: new observed reward (float)
+    Returns: new mean estimate (float)
+    """
+    # Your code here
+    pass
Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+[
+  {
+    "test": "Q = 0.0\nk = 1\nR = 5.0\nprint(round(incremental_mean(Q, k, R), 4))",
+    "expected_output": "5.0"
+  },
+  {
+    "test": "Q = 5.0\nk = 2\nR = 7.0\nprint(round(incremental_mean(Q, k, R), 4))",
+    "expected_output": "6.0"
+  },
+  {
+    "test": "Q = 6.0\nk = 3\nR = 4.0\nprint(round(incremental_mean(Q, k, R), 4))",
+    "expected_output": "5.3333"
+  }
+]

utils/convert_single_question.py

Lines changed: 1 addition & 27 deletions
@@ -29,33 +29,7 @@
 # ── 1️⃣ EDIT YOUR QUESTION HERE ────────────────────────────────────────────
 QUESTION_DICT: Dict[str, Any] = {
     "id":'158',
-    "title": "Epsilon-Greedy Action Selection for n-Armed Bandit",
-    "description": "Implement the epsilon-greedy method for action selection in an n-armed bandit problem. Given a set of estimated action values (Q-values), select an action using the epsilon-greedy policy: with probability epsilon, choose a random action; with probability 1 - epsilon, choose the action with the highest estimated value.",
-    "category": "Reinforcement Learning",
-    "difficulty": "easy",
-    "starter_code": "import numpy as np\n\ndef epsilon_greedy(Q, epsilon=0.1):\n    \"\"\"\n    Selects an action using epsilon-greedy policy.\n    Q: np.ndarray of shape (n,) -- estimated action values\n    epsilon: float in [0, 1]\n    Returns: int, selected action index\n    \"\"\"\n    # Your code here\n    pass",
-    "solution": "import numpy as np\n\ndef epsilon_greedy(Q, epsilon=0.1):\n    if np.random.rand() < epsilon:\n        return np.random.randint(len(Q))\n    else:\n        return int(np.argmax(Q))",
-    "test_cases": [
-        {
-            "test": "import numpy as np\nnp.random.seed(0)\nprint([epsilon_greedy(np.array([1, 2, 3]), epsilon=0.0) for _ in range(5)])",
-            "expected_output": "[2, 2, 2, 2, 2]"
-        },
-        {
-            "test": "import numpy as np\nnp.random.seed(1)\nprint([epsilon_greedy(np.array([5, 2, 1]), epsilon=1.0) for _ in range(5)])",
-            "expected_output": "[0, 1, 1, 0, 0]"
-        },
-        {
-            "test": "import numpy as np\nnp.random.seed(42)\nresults = [epsilon_greedy(np.array([1.5, 2.5, 0.5]), epsilon=0.5) for _ in range(10)]\nprint(results)",
-            "expected_output": "[1, 0, 1, 1, 1, 0, 1, 0, 0, 0]"
-        }
-    ],
-    "example": {
-        "input": "Q = np.array([0.5, 2.3, 1.7])\nepsilon = 0.0\naction = epsilon_greedy(Q, epsilon)\nprint(action)",
-        "output": "1",
-        "reasoning": "With epsilon=0.0 (always greedy), the highest Q-value is 2.3 at index 1, so the function always returns 1."
-    },
-    "learn_section": "### Epsilon-Greedy Policy\n\nThe epsilon-greedy method is a fundamental action selection strategy used in reinforcement learning, especially for solving the n-armed bandit problem. The key idea is to balance **exploration** (trying new actions) and **exploitation** (choosing the best-known action):\n\n- With probability $\\varepsilon$ (epsilon), the agent explores by selecting an action at random.\n- With probability $1-\\varepsilon$, it exploits by choosing the action with the highest estimated value (greedy choice).\n\nThe epsilon-greedy policy is simple to implement and provides a way to avoid getting stuck with suboptimal actions due to insufficient exploration."
-}
+}