From 18075c14812339003c96d9d4420950e7fd6f6e16 Mon Sep 17 00:00:00 2001
From: Christopher Aicher <aicherc@uw.edu>
Date: Tue, 14 Mar 2017 14:22:38 -0700
Subject: [PATCH 1/3] Moved `strategy` into subfolder and added
 `estimated_arm_sds`

---
 strategy/__init__.py                          |   2 +
 strategy.py => strategy/strategy.py           | 116 +------------
 strategy/thompson_sampling_strategy.py        | 152 ++++++++++++++++++
 ...sampling.py => thompson_sampling_script.py |   6 +-
 4 files changed, 158 insertions(+), 118 deletions(-)
 create mode 100644 strategy/__init__.py
 rename strategy.py => strategy/strategy.py (62%)
 create mode 100644 strategy/thompson_sampling_strategy.py
 rename thompson_sampling.py => thompson_sampling_script.py (90%)

diff --git a/strategy/__init__.py b/strategy/__init__.py
new file mode 100644
index 0000000..3e3c555
--- /dev/null
+++ b/strategy/__init__.py
@@ -0,0 +1,2 @@
+from strategy.strategy import *
+from strategy.thompson_sampling_strategy import *
diff --git a/strategy.py b/strategy/strategy.py
similarity index 62%
rename from strategy.py
rename to strategy/strategy.py
index 2362fb8..0402f3c 100644
--- a/strategy.py
+++ b/strategy/strategy.py
@@ -1,12 +1,12 @@
 """
 
+Base Strategy Class
 
 """
 
 import numpy as np
 import scipy.stats
 from copy import deepcopy
-import matplotlib.pyplot as plt
 
 class Strategy:
     """ Base Strategy Class
@@ -36,120 +36,6 @@ def fit(self, iterations, **kwargs):
         raise NotImplementedError
 
 
-class ThompsonSampling(Strategy):
-    def __init__(self, bandit, **kwargs):
-        self.bandit = bandit
-        self.num_arms = bandit.num_arms
-        self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)]
-        self.posterior_params = deepcopy(self.prior_params)
-
-    def fit(self, iterations, restart=False, plot=False):
-        if restart == True:
-            self._restart()
-
-        index_arms_pulled = [None] * iterations
-        observed_rewards = [None] * iterations
-        mean_reward_estimates = [None] * iterations
-        for i in range(iterations):
-            index_arms_pulled[i] = self._choose_arm()
-            observed_rewards[i] = self.pull_arm(index_arms_pulled[i])
-            self._update_posterior(index_arms_pulled[i], observed_rewards[i])
-            mean_reward_estimates[i] = self.mean_reward_estimates
-
-        if plot == True:
-            plt.close()
-            plt.plot(mean_reward_estimates)
-            plt.show()
-
-        out = {
-            'rewards': observed_rewards,
-            'arms_pulled': index_arms_pulled,
-            'estimated_arm_means': mean_reward_estimates
-        }
-        return out
-
-    def _restart(self):
-        self.posterior_params = deepcopy(self.prior_params)
-
-    def _choose_arm(self):
-        samples = [self._sample(**params) for params in self.posterior_params]
-        return np.argmax(samples)
-
-    def pull_arm(self, arm_index):
-        return self.bandit.pull_arm(arm_index)
-
-    def _sample(self, **kwargs):
-        raise NotImplementedError
-
-    def _update_posterior(self, arm_index, observed_reward):
-        raise NotImplementedError
-
-    @property
-    def mean_reward_estimates(self):
-        raise NotImplementedError
-
-
-class ThompsonBernoulli(ThompsonSampling):
-    """asdf"""
-
-    def __init__(self, bandit, alpha_prior, beta_prior):
-        ThompsonSampling.__init__(self, bandit, alpha=alpha_prior, beta=beta_prior)
-
-    def _sample(self, alpha, beta):
-        return scipy.stats.beta.rvs(alpha, beta, size=1)
-
-    def _update_posterior(self, arm_index, observed_reward):
-        if observed_reward == 1:
-            self.posterior_params[arm_index]['alpha'] += 1
-        else:
-            self.posterior_params[arm_index]['beta'] += 1
-
-    @property
-    def mean_reward_estimates(self):
-        return [params['alpha'] / (params['alpha'] + params['beta']) for params
-                in self.posterior_params]
-
-
-class ThompsonGaussianKnownSigma(ThompsonSampling):
-    """asdf """
-
-    def __init__(self, bandit, sigma, mu_prior, sigma_prior, memory_multiplier=0.9):
-        ThompsonSampling.__init__(self, bandit, mu=mu_prior, sigma2=sigma_prior ** 2)
-        self.sigma2 = sigma ** 2
-        self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)]
-        self.memory_multiplier = memory_multiplier
-
-    def _sample(self, mu, sigma2):
-        return np.random.normal(loc=mu, scale=np.sqrt(sigma2))
-
-    def _update_posterior(self, arm_index, observed_reward):
-        sigma2 = self.sigma2
-
-        for index in range(self.num_arms):
-            old_n = self.sufficient_statistics[index]['n']
-            old_xsum = self.sufficient_statistics[index]['xsum']
-            mu_prior = self.prior_params[index]['mu']
-            sigma2_prior = self.prior_params[index]['sigma2']
-
-            if index == arm_index:
-                new_n = self.memory_multiplier * old_n + 1
-                new_xsum = self.memory_multiplier * old_xsum + observed_reward
-            else:
-                new_n = self.memory_multiplier * old_n
-                new_xsum = self.memory_multiplier * old_xsum
-
-            new_sigma2_posterior = 1 / (1 / sigma2_prior + new_n / sigma2)
-            new_mu_posterior = (mu_prior / sigma2_prior + new_xsum / sigma2) * new_sigma2_posterior
-
-            self.sufficient_statistics[index]['n'] = new_n
-            self.sufficient_statistics[index]['xsum'] = new_xsum
-            self.posterior_params[index]['mu'] = new_mu_posterior
-            self.posterior_params[index]['sigma2'] = new_sigma2_posterior
-
-    @property
-    def mean_reward_estimates(self):
-        return [params['mu'] for params in self.posterior_params]
-
 class EpsilonGreedy(Strategy):
     """ Epislon Greedy Strategy
 
diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py
new file mode 100644
index 0000000..e5ff3db
--- /dev/null
+++ b/strategy/thompson_sampling_strategy.py
@@ -0,0 +1,152 @@
+"""
+
+ThompsonSampling Strategy
+
+"""
+
+import numpy as np
+import scipy.stats
+from copy import deepcopy
+from strategy.strategy import Strategy
+
+
+class ThompsonSampling(Strategy):
+    """ Thompson Sampling Strategy
+
+    Args:
+        bandit (Bandit): bandit
+        **kwargs : prior parameters
+
+    Attributes:
+        num_arms (int)
+        posterior_params (list of list): posterior parameters
+        estimated_arm_means (ndarray): posterior predictive mean
+        estimated_arm_sds (ndarray): posterior predictive standard deviation
+    """
+    def __init__(self, bandit, **kwargs):
+        self.bandit = bandit
+        self.num_arms = bandit.num_arms
+        self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)]
+        self.posterior_params = deepcopy(self.prior_params)
+
+    def fit(self, iterations, restart=False):
+        if restart == True:
+            self._restart()
+
+        index_arms_pulled = [None] * iterations
+        observed_rewards = [None] * iterations
+        estimated_arm_means = [None] * iterations
+        estimated_arm_sds = [None] * iterations
+        for i in range(iterations):
+            index_arms_pulled[i] = self._choose_arm()
+            observed_rewards[i] = self.pull_arm(index_arms_pulled[i])
+            self._update_posterior(index_arms_pulled[i], observed_rewards[i])
+            estimated_arm_means[i] = self.estimated_arm_means
+            estimated_arm_sds[i] = self.estimated_arm_sds
+
+        out = {
+            'rewards': np.array(observed_rewards),
+            'arms_pulled': np.array(index_arms_pulled),
+            'estimated_arm_means': np.array(estimated_arm_means),
+            'estimated_arm_sds': np.array(estimated_arm_sds)
+        }
+        return out
+
+    def _restart(self):
+        self.posterior_params = deepcopy(self.prior_params)
+
+    def _choose_arm(self):
+        samples = [self._sample(**params) for params in self.posterior_params]
+        return np.argmax(samples)
+
+    def pull_arm(self, arm_index):
+        return self.bandit.pull_arm(arm_index)
+
+    def _sample(self, **kwargs):
+        raise NotImplementedError
+
+    def _update_posterior(self, arm_index, observed_reward):
+        raise NotImplementedError
+
+    @property
+    def estimated_arm_means(self):
+        raise NotImplementedError
+
+    @property
+    def estimated_arm_sds(self):
+        raise NotImplementedError
+
+
+class ThompsonBernoulli(ThompsonSampling):
+    def __init__(self, bandit, alpha_prior, beta_prior):
+        ThompsonSampling.__init__(self, bandit, alpha=alpha_prior, beta=beta_prior)
+
+    def _sample(self, alpha, beta):
+        return scipy.stats.beta.rvs(alpha, beta, size=1)
+
+    def _update_posterior(self, arm_index, observed_reward):
+        if observed_reward == 1:
+            self.posterior_params[arm_index]['alpha'] += 1
+        else:
+            self.posterior_params[arm_index]['beta'] += 1
+
+    @property
+    def estimated_arm_means(self):
+        mean = lambda params: \
+                params['alpha'] / (params['alpha'] + params['beta'])
+        return np.array([ mean(params) for params in self.posterior_params])
+    @property
+    def estimated_arm_sds(self):
+        sd = lambda params: np.sqrt(
+                    (params['alpha'] / (params['alpha'] + params['beta'])) *
+                    (1 - params['alpha'] / (params['alpha'] + params['beta']))
+                )
+        return np.array([sd(params) for params in self.posterior_params])
+
+class ThompsonGaussianKnownSigma(ThompsonSampling):
+    def __init__(self, bandit, sigma, mu_prior, sigma_prior,
+            memory_multiplier=0.9):
+        ThompsonSampling.__init__(self, bandit, mu=mu_prior,
+                sigma2=sigma_prior ** 2)
+        self.sigma2 = sigma ** 2
+        self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)]
+        self.memory_multiplier = memory_multiplier
+
+    def _sample(self, mu, sigma2):
+        return np.random.normal(loc=mu, scale=np.sqrt(sigma2))
+
+    def _update_posterior(self, arm_index, observed_reward):
+        sigma2 = self.sigma2
+
+        for index in range(self.num_arms):
+            old_n = self.sufficient_statistics[index]['n']
+            old_xsum = self.sufficient_statistics[index]['xsum']
+            mu_prior = self.prior_params[index]['mu']
+            sigma2_prior = self.prior_params[index]['sigma2']
+
+            if index == arm_index:
+                new_n = self.memory_multiplier * old_n + 1
+                new_xsum = self.memory_multiplier * old_xsum + observed_reward
+            else:
+                new_n = self.memory_multiplier * old_n
+                new_xsum = self.memory_multiplier * old_xsum
+
+            new_sigma2_posterior = 1 / (1 / sigma2_prior + new_n / sigma2)
+            new_mu_posterior = (mu_prior / sigma2_prior + new_xsum / sigma2) * new_sigma2_posterior
+
+            self.sufficient_statistics[index]['n'] = new_n
+            self.sufficient_statistics[index]['xsum'] = new_xsum
+            self.posterior_params[index]['mu'] = new_mu_posterior
+            self.posterior_params[index]['sigma2'] = new_sigma2_posterior
+
+    @property
+    def estimated_arm_means(self):
+        return np.array([params['mu'] for params in self.posterior_params])
+
+    @property
+    def estimated_arm_sds(self):
+        return np.array([self.sigma2 + params['sigma2']
+            for params in self.posterior_params])
+
+
+
diff --git a/thompson_sampling.py b/thompson_sampling_script.py
similarity index 90%
rename from thompson_sampling.py
rename to thompson_sampling_script.py
index 864f6f5..4108d31 100644
--- a/thompson_sampling.py
+++ b/thompson_sampling_script.py
@@ -22,7 +22,7 @@
     print('Prior params: ' + str(bernoulli_strategy.prior_params))
     print('Posterior params: ' + str(bernoulli_strategy.posterior_params))
     print('True means: ' + str([0.45, 0.55]))
-    print('Mean reward estimates: ' + str(bernoulli_strategy.mean_reward_estimates))
+    print('Arm mean reward estimates: ' + str(bernoulli_strategy.estimated_arm_means))
 
     # Gaussian
     gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=95, sigma=30),
@@ -34,7 +34,7 @@
     print('Prior params: ' + str(gaussian_strategy.prior_params))
     print('Posterior params: ' + str(gaussian_strategy.posterior_params))
     print('True means: ' + str([95, 105]))
-    print('Mean reward estimates: ' + str(gaussian_strategy.mean_reward_estimates))
+    print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means))
 
     # Linear interpolation bandit with Gaussian errors
     dynamic_bandit = LinearInterpolationBandit(means=np.array([[5.0, 8.0], [10.0, 5.0]]),
@@ -43,7 +43,7 @@
                                                   sigma=20,
                                                   mu_prior=0, sigma_prior=200,
                                                   memory_multiplier=0.9)
-    dynamic_strategy.fit(iterations=1000, plot=True)
+    out = dynamic_strategy.fit(iterations=1000)
     print('Prior params: ' + str(dynamic_strategy.prior_params))
     print('Posterior params: ' + str(dynamic_strategy.posterior_params))
 

From c38873a4536719637e1356b21423431876a94b20 Mon Sep 17 00:00:00 2001
From: Christopher Aicher <aicherc@uw.edu>
Date: Tue, 14 Mar 2017 15:39:44 -0700
Subject: [PATCH 2/3] Added ThompsonSampling for Gaussian with unknown mean +
 variance

---
 strategy/thompson_sampling_strategy.py | 78 ++++++++++++++++++++++++--
 thompson_sampling_script.py            | 21 ++++++-
 2 files changed, 90 insertions(+), 9 deletions(-)

diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py
index e5ff3db..b8235c3 100644
--- a/strategy/thompson_sampling_strategy.py
+++ b/strategy/thompson_sampling_strategy.py
@@ -20,8 +20,8 @@ class ThompsonSampling(Strategy):
     Attributes:
         num_arms (int)
         posterior_params (list of list): posterior parameters
-        estimated_arm_means (ndarray): posterior predictive mean
-        estimated_arm_sds (ndarray): posterior predictive standard deviation
+        estimated_arm_means (ndarray): posterior mean
+        estimated_arm_sds (ndarray): posterior standard deviation
     """
     def __init__(self, bandit, **kwargs):
         self.bandit = bandit
@@ -98,18 +98,21 @@ def estimated_arm_means(self):
     @property
     def estimated_arm_sds(self):
         sd = lambda params: np.sqrt(
-                    (params['alpha'] / (params['alpha'] + params['beta'])) *
-                    (1 - params['alpha'] / (params['alpha'] + params['beta']))
+                    params['alpha'] * params['beta'] /
+                    ( (params['alpha'] + params['beta'])**2 *
+                      (params['alpha'] + params['beta'] + 1) )
                 )
         return np.array([sd(params) for params in self.posterior_params])
 
 class ThompsonGaussianKnownSigma(ThompsonSampling):
     def __init__(self, bandit, sigma, mu_prior, sigma_prior,
-            memory_multiplier=0.9):
+            memory_multiplier=1.0):
         ThompsonSampling.__init__(self, bandit, mu=mu_prior,
                 sigma2=sigma_prior ** 2)
         self.sigma2 = sigma ** 2
         self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)]
+        if memory_multiplier <= 0 or memory_multiplier > 1:
+            raise ValueError("Memory Multiplier should be in (0,1]")
         self.memory_multiplier = memory_multiplier
 
     def _sample(self, mu, sigma2):
@@ -138,6 +141,7 @@ def _update_posterior(self, arm_index, observed_reward):
             self.sufficient_statistics[index]['xsum'] = new_xsum
             self.posterior_params[index]['mu'] = new_mu_posterior
             self.posterior_params[index]['sigma2'] = new_sigma2_posterior
+        return
 
     @property
     def estimated_arm_means(self):
@@ -145,7 +149,69 @@ def estimated_arm_means(self):
 
     @property
     def estimated_arm_sds(self):
-        return np.array([self.sigma2 + params['sigma2']
+        return np.array([params['sigma2'] for params in self.posterior_params])
+
+class ThompsonGaussian(ThompsonSampling):
+    def __init__(self, bandit, mu_prior, nu_prior, alpha_prior, beta_prior,
+            memory_multiplier=1.0):
+        ThompsonSampling.__init__(self, bandit,
+                mu=mu_prior, nu=nu_prior, alpha=alpha_prior, beta=beta_prior)
+        self.sufficient_statistics = {
+                'n': np.zeros(self.num_arms),
+                'xsum': np.zeros(self.num_arms),
+                'x2sum': np.zeros(self.num_arms),
+                }
+        if memory_multiplier <= 0 or memory_multiplier > 1:
+            raise ValueError("Memory Multiplier should be in (0,1]")
+        self.memory_multiplier = memory_multiplier
+
+    def _sample(self, mu, nu, alpha, beta):
+        sigma2 = scipy.stats.invgamma.rvs(a=alpha, scale=beta)
+        return np.random.normal(loc=mu, scale=np.sqrt(sigma2/nu))
+
+    def _update_posterior(self, arm_index, observed_reward):
+        # Update Sufficient Statistics of arm_index
+        self.sufficient_statistics['n'][arm_index] += 1
+        self.sufficient_statistics['xsum'][arm_index] += observed_reward
+        self.sufficient_statistics['x2sum'][arm_index] += observed_reward**2
+
+        # Memory Multiplier Decay
+        self.sufficient_statistics['n'] *= self.memory_multiplier
+        self.sufficient_statistics['xsum'] *= self.memory_multiplier
+        self.sufficient_statistics['x2sum'] *= self.memory_multiplier
+
+        # Update posterior params -> Complicated due to posterior_params
+        #                            (really should use pandas)
+        for index in range(self.num_arms):
+            n = self.sufficient_statistics['n'][index]
+            xsum = self.sufficient_statistics['xsum'][index]
+            x2sum = self.sufficient_statistics['x2sum'][index]
+            mu = self.prior_params[index]['mu']
+            nu = self.prior_params[index]['nu']
+            alpha = self.prior_params[index]['alpha']
+            beta = self.prior_params[index]['beta']
+
+            # Terminate early if no data to update
+            if n == 0:
+                continue
+
+            # Conjugate Prior Update
+            self.posterior_params[index]['mu'] = (mu * nu + xsum)/(nu + n)
+            self.posterior_params[index]['nu'] = nu + n
+            self.posterior_params[index]['alpha'] = alpha + n/2.0
+            self.posterior_params[index]['beta'] = \
+                    beta + 0.5*(x2sum - xsum**2/n) + \
+                    0.5*(nu*n)/(nu+n)*(xsum/n - mu)**2
+        return
+
+    @property
+    def estimated_arm_means(self):
+        return np.array([params['mu'] for params in self.posterior_params])
+
+    @property
+    def estimated_arm_sds(self):
+        return np.array([
+            np.sqrt(params['beta'] / (params['alpha']-1) / params['nu'])
             for params in self.posterior_params])
 
 
diff --git a/thompson_sampling_script.py b/thompson_sampling_script.py
index 4108d31..effcca1 100644
--- a/thompson_sampling_script.py
+++ b/thompson_sampling_script.py
@@ -5,10 +5,13 @@
 """
 
 import numpy as np
-
 from arm import BernoulliArm, GaussianArm
 from bandit import StaticBandit, LinearInterpolationBandit
-from strategy import ThompsonBernoulli, ThompsonGaussianKnownSigma
+from strategy import (
+        ThompsonBernoulli,
+        ThompsonGaussianKnownSigma,
+        ThompsonGaussian,
+        )
 
 if __name__ == '__main__':
     np.random.seed(0)
@@ -24,7 +27,7 @@
     print('True means: ' + str([0.45, 0.55]))
     print('Arm mean reward estimates: ' + str(bernoulli_strategy.estimated_arm_means))
 
-    # Gaussian
+    # Gaussian with Known Sigma
     gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=95, sigma=30),
                                          GaussianArm(mu=105, sigma=30)])
     gaussian_strategy = ThompsonGaussianKnownSigma(bandit=gaussian_bandit,
@@ -36,6 +39,18 @@
     print('True means: ' + str([95, 105]))
     print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means))
 
+    # Gaussian
+    gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=90, sigma=5),
+                                         GaussianArm(mu=110, sigma=5)])
+    gaussian_strategy = ThompsonGaussian(bandit=gaussian_bandit,
+                                         mu_prior=100.0, nu_prior=0.01,
+                                         alpha_prior=1.01, beta_prior=1.0)
+    out = gaussian_strategy.fit(iterations=500)
+    print('Prior params: ' + str(gaussian_strategy.prior_params))
+    print('Posterior params: ' + str(gaussian_strategy.posterior_params))
+    print('True means: ' + str([95, 105]))
+    print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means))
+
     # Linear interpolation bandit with Gaussian errors
     dynamic_bandit = LinearInterpolationBandit(means=np.array([[5.0, 8.0], [10.0, 5.0]]),
                                                periods=[200, 200])

From f4201f676888d72c30358856b55764bf698cb2e4 Mon Sep 17 00:00:00 2001
From: Christopher Aicher <aicherc@uw.edu>
Date: Tue, 14 Mar 2017 20:04:43 -0700
Subject: [PATCH 3/3] Changed `pull_arm` to private `_pull_arm` and `num_arms`
 to property

---
 strategy/thompson_sampling_strategy.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py
index b8235c3..052d89b 100644
--- a/strategy/thompson_sampling_strategy.py
+++ b/strategy/thompson_sampling_strategy.py
@@ -25,7 +25,6 @@ class ThompsonSampling(Strategy):
     """
     def __init__(self, bandit, **kwargs):
         self.bandit = bandit
-        self.num_arms = bandit.num_arms
         self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)]
         self.posterior_params = deepcopy(self.prior_params)
 
@@ -39,7 +38,7 @@ def fit(self, iterations, restart=False):
         estimated_arm_sds = [None] * iterations
         for i in range(iterations):
             index_arms_pulled[i] = self._choose_arm()
-            observed_rewards[i] = self.pull_arm(index_arms_pulled[i])
+            observed_rewards[i] = self._pull_arm(index_arms_pulled[i])
             self._update_posterior(index_arms_pulled[i], observed_rewards[i])
             estimated_arm_means[i] = self.estimated_arm_means
             estimated_arm_sds[i] = self.estimated_arm_sds
@@ -59,7 +58,7 @@ def _choose_arm(self):
         samples = [self._sample(**params) for params in self.posterior_params]
         return np.argmax(samples)
 
-    def pull_arm(self, arm_index):
+    def _pull_arm(self, arm_index):
         return self.bandit.pull_arm(arm_index)
 
     def _sample(self, **kwargs):
@@ -68,6 +67,10 @@ def _sample(self, **kwargs):
     def _update_posterior(self, arm_index, observed_reward):
         raise NotImplementedError
 
+    @property
+    def num_arms(self):
+        return bandit.num_arms
+
     @property
     def estimated_arm_means(self):
         raise NotImplementedError