From 18075c14812339003c96d9d4420950e7fd6f6e16 Mon Sep 17 00:00:00 2001 From: Christopher Aicher Date: Tue, 14 Mar 2017 14:22:38 -0700 Subject: [PATCH 1/3] Moved `strategy` into subfolder and added `estimated_arm_sds` --- strategy/__init__.py | 2 + strategy.py => strategy/strategy.py | 116 +------------ strategy/thompson_sampling_strategy.py | 152 ++++++++++++++++++ ...sampling.py => thompson_sampling_script.py | 6 +- 4 files changed, 158 insertions(+), 118 deletions(-) create mode 100644 strategy/__init__.py rename strategy.py => strategy/strategy.py (62%) create mode 100644 strategy/thompson_sampling_strategy.py rename thompson_sampling.py => thompson_sampling_script.py (90%) diff --git a/strategy/__init__.py b/strategy/__init__.py new file mode 100644 index 0000000..3e3c555 --- /dev/null +++ b/strategy/__init__.py @@ -0,0 +1,2 @@ +from strategy.strategy import * +from strategy.thompson_sampling_strategy import * diff --git a/strategy.py b/strategy/strategy.py similarity index 62% rename from strategy.py rename to strategy/strategy.py index 2362fb8..0402f3c 100644 --- a/strategy.py +++ b/strategy/strategy.py @@ -1,12 +1,12 @@ """ +Base Strategy Class """ import numpy as np import scipy.stats from copy import deepcopy -import matplotlib.pyplot as plt class Strategy: """ Base Strategy Class @@ -36,120 +36,6 @@ def fit(self, iterations, **kwargs): raise NotImplementedError -class ThompsonSampling(Strategy): - def __init__(self, bandit, **kwargs): - self.bandit = bandit - self.num_arms = bandit.num_arms - self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)] - self.posterior_params = deepcopy(self.prior_params) - - def fit(self, iterations, restart=False, plot=False): - if restart == True: - self._restart() - - index_arms_pulled = [None] * iterations - observed_rewards = [None] * iterations - mean_reward_estimates = [None] * iterations - for i in range(iterations): - index_arms_pulled[i] = self._choose_arm() - observed_rewards[i] = self.pull_arm(index_arms_pulled[i]) - self._update_posterior(index_arms_pulled[i], observed_rewards[i]) - mean_reward_estimates[i] = self.mean_reward_estimates - - if plot == True: - plt.close() - plt.plot(mean_reward_estimates) - plt.show() - - out = { - 'rewards': observed_rewards, - 'arms_pulled': index_arms_pulled, - 'estimated_arm_means': mean_reward_estimates - } - return out - - def _restart(self): - self.posterior_params = deepcopy(self.prior_params) - - def _choose_arm(self): - samples = [self._sample(**params) for params in self.posterior_params] - return np.argmax(samples) - - def pull_arm(self, arm_index): - return self.bandit.pull_arm(arm_index) - - def _sample(self, **kwargs): - raise NotImplementedError - - def _update_posterior(self, arm_index, observed_reward): - raise NotImplementedError - - @property - def mean_reward_estimates(self): - raise NotImplementedError - - -class ThompsonBernoulli(ThompsonSampling): - """asdf""" - - def __init__(self, bandit, alpha_prior, beta_prior): - ThompsonSampling.__init__(self, bandit, alpha=alpha_prior, beta=beta_prior) - - def _sample(self, alpha, beta): - return scipy.stats.beta.rvs(alpha, beta, size=1) - - def _update_posterior(self, arm_index, observed_reward): - if observed_reward == 1: - self.posterior_params[arm_index]['alpha'] += 1 - else: - self.posterior_params[arm_index]['beta'] += 1 - - @property - def mean_reward_estimates(self): - return [params['alpha'] / (params['alpha'] + params['beta']) for params - in self.posterior_params] - - -class ThompsonGaussianKnownSigma(ThompsonSampling): - """asdf """ - - def __init__(self, bandit, sigma, mu_prior, sigma_prior, memory_multiplier=0.9): - ThompsonSampling.__init__(self, bandit, mu=mu_prior, sigma2=sigma_prior ** 2) - self.sigma2 = sigma ** 2 - self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)] - self.memory_multiplier = memory_multiplier - - def _sample(self, mu, sigma2): - return np.random.normal(loc=mu, scale=np.sqrt(sigma2)) - - def _update_posterior(self, arm_index, observed_reward): - sigma2 = self.sigma2 - - for index in range(self.num_arms): - old_n = self.sufficient_statistics[index]['n'] - old_xsum = self.sufficient_statistics[index]['xsum'] - mu_prior = self.prior_params[index]['mu'] - sigma2_prior = self.prior_params[index]['sigma2'] - - if index == arm_index: - new_n = self.memory_multiplier * old_n + 1 - new_xsum = self.memory_multiplier * old_xsum + observed_reward - else: - new_n = self.memory_multiplier * old_n - new_xsum = self.memory_multiplier * old_xsum - - new_sigma2_posterior = 1 / (1 / sigma2_prior + new_n / sigma2) - new_mu_posterior = (mu_prior / sigma2_prior + new_xsum / sigma2) * new_sigma2_posterior - - self.sufficient_statistics[index]['n'] = new_n - self.sufficient_statistics[index]['xsum'] = new_xsum - self.posterior_params[index]['mu'] = new_mu_posterior - self.posterior_params[index]['sigma2'] = new_sigma2_posterior - - @property - def mean_reward_estimates(self): - return [params['mu'] for params in self.posterior_params] - class EpsilonGreedy(Strategy): """ Epislon Greedy Strategy diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py new file mode 100644 index 0000000..e5ff3db --- /dev/null +++ b/strategy/thompson_sampling_strategy.py @@ -0,0 +1,152 @@ +""" + +ThompsonSampling Strategy + +""" + +import numpy as np +import scipy.stats +from copy import deepcopy +from strategy.strategy import Strategy + + +class ThompsonSampling(Strategy): + """ Thompson Sampling Strategy + + Args: + bandit (Bandit): bandit + **kwargs : prior parameters + + Attributes: + num_arms (int) + posterior_params (list of list): posterior parameters + estimated_arm_means (ndarray): posterior predictive mean + estimated_arm_sds (ndarray): posterior predictive standard deviation + """ + def __init__(self, bandit, **kwargs): + self.bandit = bandit + self.num_arms = bandit.num_arms + self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)] + self.posterior_params = deepcopy(self.prior_params) + + def fit(self, iterations, restart=False): + if restart == True: + self._restart() + + index_arms_pulled = [None] * iterations + observed_rewards = [None] * iterations + estimated_arm_means = [None] * iterations + estimated_arm_sds = [None] * iterations + for i in range(iterations): + index_arms_pulled[i] = self._choose_arm() + observed_rewards[i] = self.pull_arm(index_arms_pulled[i]) + self._update_posterior(index_arms_pulled[i], observed_rewards[i]) + estimated_arm_means[i] = self.estimated_arm_means + estimated_arm_sds[i] = self.estimated_arm_sds + + out = { + 'rewards': np.array(observed_rewards), + 'arms_pulled': np.array(index_arms_pulled), + 'estimated_arm_means': np.array(estimated_arm_means), + 'estimated_arm_sds': np.array(estimated_arm_sds) + } + return out + + def _restart(self): + self.posterior_params = deepcopy(self.prior_params) + + def _choose_arm(self): + samples = [self._sample(**params) for params in self.posterior_params] + return np.argmax(samples) + + def pull_arm(self, arm_index): + return self.bandit.pull_arm(arm_index) + + def _sample(self, **kwargs): + raise NotImplementedError + + def _update_posterior(self, arm_index, observed_reward): + raise NotImplementedError + + @property + def estimated_arm_means(self): + raise NotImplementedError + + @property + def estimated_arm_sds(self): + raise NotImplementedError + + +class ThompsonBernoulli(ThompsonSampling): + def __init__(self, bandit, alpha_prior, beta_prior): + ThompsonSampling.__init__(self, bandit, alpha=alpha_prior, beta=beta_prior) + + def _sample(self, alpha, beta): + return scipy.stats.beta.rvs(alpha, beta, size=1) + + def _update_posterior(self, arm_index, observed_reward): + if observed_reward == 1: + self.posterior_params[arm_index]['alpha'] += 1 + else: + self.posterior_params[arm_index]['beta'] += 1 + + @property + def estimated_arm_means(self): + mean = lambda params: \ + params['alpha'] / (params['alpha'] + params['beta']) + return np.array([ mean(params) for params in self.posterior_params]) + @property + def estimated_arm_sds(self): + sd = lambda params: np.sqrt( + (params['alpha'] / (params['alpha'] + params['beta'])) * + (1 - params['alpha'] / (params['alpha'] + params['beta'])) + ) + return np.array([sd(params) for params in self.posterior_params]) + +class ThompsonGaussianKnownSigma(ThompsonSampling): + def __init__(self, bandit, sigma, mu_prior, sigma_prior, + memory_multiplier=0.9): + ThompsonSampling.__init__(self, bandit, mu=mu_prior, + sigma2=sigma_prior ** 2) + self.sigma2 = sigma ** 2 + self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)] + self.memory_multiplier = memory_multiplier + + def _sample(self, mu, sigma2): + return np.random.normal(loc=mu, scale=np.sqrt(sigma2)) + + def _update_posterior(self, arm_index, observed_reward): + sigma2 = self.sigma2 + + for index in range(self.num_arms): + old_n = self.sufficient_statistics[index]['n'] + old_xsum = self.sufficient_statistics[index]['xsum'] + mu_prior = self.prior_params[index]['mu'] + sigma2_prior = self.prior_params[index]['sigma2'] + + if index == arm_index: + new_n = self.memory_multiplier * old_n + 1 + new_xsum = self.memory_multiplier * old_xsum + observed_reward + else: + new_n = self.memory_multiplier * old_n + new_xsum = self.memory_multiplier * old_xsum + + new_sigma2_posterior = 1 / (1 / sigma2_prior + new_n / sigma2) + new_mu_posterior = (mu_prior / sigma2_prior + new_xsum / sigma2) * new_sigma2_posterior + + self.sufficient_statistics[index]['n'] = new_n + self.sufficient_statistics[index]['xsum'] = new_xsum + self.posterior_params[index]['mu'] = new_mu_posterior + self.posterior_params[index]['sigma2'] = new_sigma2_posterior + + @property + def estimated_arm_means(self): + return np.array([params['mu'] for params in self.posterior_params]) + + @property + def estimated_arm_sds(self): + return np.array([self.sigma2 + params['sigma2'] + for params in self.posterior_params]) + + + diff --git a/thompson_sampling.py b/thompson_sampling_script.py similarity index 90% rename from thompson_sampling.py rename to thompson_sampling_script.py index 864f6f5..4108d31 100644 --- a/thompson_sampling.py +++ b/thompson_sampling_script.py @@ -22,7 +22,7 @@ print('Prior params: ' + str(bernoulli_strategy.prior_params)) print('Posterior params: ' + str(bernoulli_strategy.posterior_params)) print('True means: ' + str([0.45, 0.55])) - print('Mean reward estimates: ' + str(bernoulli_strategy.mean_reward_estimates)) + print('Arm mean reward estimates: ' + str(bernoulli_strategy.estimated_arm_means)) # Gaussian gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=95, sigma=30), @@ -34,7 +34,7 @@ print('Prior params: ' + str(gaussian_strategy.prior_params)) print('Posterior params: ' + str(gaussian_strategy.posterior_params)) print('True means: ' + str([95, 105])) - print('Mean reward estimates: ' + str(gaussian_strategy.mean_reward_estimates)) + print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means)) # Linear interpolation bandit with Gaussian errors dynamic_bandit = LinearInterpolationBandit(means=np.array([[5.0, 8.0], [10.0, 5.0]]), @@ -43,7 +43,7 @@ sigma=20, mu_prior=0, sigma_prior=200, memory_multiplier=0.9) - dynamic_strategy.fit(iterations=1000, plot=True) + out = dynamic_strategy.fit(iterations=1000) print('Prior params: ' + str(dynamic_strategy.prior_params)) print('Posterior params: ' + str(dynamic_strategy.posterior_params)) From c38873a4536719637e1356b21423431876a94b20 Mon Sep 17 00:00:00 2001 From: Christopher Aicher Date: Tue, 14 Mar 2017 15:39:44 -0700 Subject: [PATCH 2/3] Added ThompsonSampling for Gaussian with unknown mean + variance --- strategy/thompson_sampling_strategy.py | 78 ++++++++++++++++++++++++-- thompson_sampling_script.py | 21 ++++++- 2 files changed, 90 insertions(+), 9 deletions(-) diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py index e5ff3db..b8235c3 100644 --- a/strategy/thompson_sampling_strategy.py +++ b/strategy/thompson_sampling_strategy.py @@ -20,8 +20,8 @@ class ThompsonSampling(Strategy): Attributes: num_arms (int) posterior_params (list of list): posterior parameters - estimated_arm_means (ndarray): posterior predictive mean - estimated_arm_sds (ndarray): posterior predictive standard deviation + estimated_arm_means (ndarray): posterior mean + estimated_arm_sds (ndarray): posterior standard deviation """ def __init__(self, bandit, **kwargs): self.bandit = bandit @@ -98,18 +98,21 @@ def estimated_arm_means(self): @property def estimated_arm_sds(self): sd = lambda params: np.sqrt( - (params['alpha'] / (params['alpha'] + params['beta'])) * - (1 - params['alpha'] / (params['alpha'] + params['beta'])) + params['alpha'] * params['beta'] / + ( (params['alpha'] + params['beta'])**2 * + (params['alpha'] + params['beta'] + 1) ) ) return np.array([sd(params) for params in self.posterior_params]) class ThompsonGaussianKnownSigma(ThompsonSampling): def __init__(self, bandit, sigma, mu_prior, sigma_prior, - memory_multiplier=0.9): + memory_multiplier=1.0): ThompsonSampling.__init__(self, bandit, mu=mu_prior, sigma2=sigma_prior ** 2) self.sigma2 = sigma ** 2 self.sufficient_statistics = [{'n': 0, 'xsum': 0} for _ in range(self.num_arms)] + if memory_multiplier <= 0 or memory_multiplier > 1: + raise ValueError("Memory Multiplier should be in (0,1]") self.memory_multiplier = memory_multiplier def _sample(self, mu, sigma2): @@ -138,6 +141,7 @@ def _update_posterior(self, arm_index, observed_reward): self.sufficient_statistics[index]['xsum'] = new_xsum self.posterior_params[index]['mu'] = new_mu_posterior self.posterior_params[index]['sigma2'] = new_sigma2_posterior + return @property def estimated_arm_means(self): @@ -145,7 +149,69 @@ def estimated_arm_means(self): @property def estimated_arm_sds(self): - return np.array([self.sigma2 + params['sigma2'] + return np.array([params['sigma2'] for params in self.posterior_params]) + +class ThompsonGaussian(ThompsonSampling): + def __init__(self, bandit, mu_prior, nu_prior, alpha_prior, beta_prior, + memory_multiplier=1.0): + ThompsonSampling.__init__(self, bandit, + mu=mu_prior, nu=nu_prior, alpha=alpha_prior, beta=beta_prior) + self.sufficient_statistics = { + 'n': np.zeros(self.num_arms), + 'xsum': np.zeros(self.num_arms), + 'x2sum': np.zeros(self.num_arms), + } + if memory_multiplier <= 0 or memory_multiplier > 1: + raise ValueError("Memory Multiplier should be in (0,1]") + self.memory_multiplier = memory_multiplier + + def _sample(self, mu, nu, alpha, beta): + sigma2 = scipy.stats.invgamma.rvs(a=alpha, scale=beta) + return np.random.normal(loc=mu, scale=np.sqrt(sigma2/nu)) + + def _update_posterior(self, arm_index, observed_reward): + # Update Sufficient Statistics of arm_index + self.sufficient_statistics['n'][arm_index] += 1 + self.sufficient_statistics['xsum'][arm_index] += observed_reward + self.sufficient_statistics['x2sum'][arm_index] += observed_reward**2 + + # Memory Multiplier Decay + self.sufficient_statistics['n'] *= self.memory_multiplier + self.sufficient_statistics['xsum'] *= self.memory_multiplier + self.sufficient_statistics['x2sum'] *= self.memory_multiplier + + # Update posterior params -> Complicated due to posterior_params + # (really should use pandas) + for index in range(self.num_arms): + n = self.sufficient_statistics['n'][index] + xsum = self.sufficient_statistics['xsum'][index] + x2sum = self.sufficient_statistics['x2sum'][index] + mu = self.prior_params[index]['mu'] + nu = self.prior_params[index]['nu'] + alpha = self.prior_params[index]['alpha'] + beta = self.prior_params[index]['beta'] + + # Terminate early if no data to update + if n == 0: + continue + + # Conjugate Prior Update + self.posterior_params[index]['mu'] = (mu * nu + xsum)/(nu + n) + self.posterior_params[index]['nu'] = nu + n + self.posterior_params[index]['alpha'] = alpha + n/2.0 + self.posterior_params[index]['beta'] = \ + beta + 0.5*(x2sum - xsum**2/n) + \ + 0.5*(nu*n)/(nu+n)*(xsum/n - mu)**2 + return + + @property + def estimated_arm_means(self): + return np.array([params['mu'] for params in self.posterior_params]) + + @property + def estimated_arm_sds(self): + return np.array([ + np.sqrt(params['beta'] / (params['alpha']-1) / params['nu']) for params in self.posterior_params]) diff --git a/thompson_sampling_script.py b/thompson_sampling_script.py index 4108d31..effcca1 100644 --- a/thompson_sampling_script.py +++ b/thompson_sampling_script.py @@ -5,10 +5,13 @@ """ import numpy as np - from arm import BernoulliArm, GaussianArm from bandit import StaticBandit, LinearInterpolationBandit -from strategy import ThompsonBernoulli, ThompsonGaussianKnownSigma +from strategy import ( + ThompsonBernoulli, + ThompsonGaussianKnownSigma, + ThompsonGaussian, + ) if __name__ == '__main__': np.random.seed(0) @@ -24,7 +27,7 @@ print('True means: ' + str([0.45, 0.55])) print('Arm mean reward estimates: ' + str(bernoulli_strategy.estimated_arm_means)) - # Gaussian + # Gaussian with Known Sigma gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=95, sigma=30), GaussianArm(mu=105, sigma=30)]) gaussian_strategy = ThompsonGaussianKnownSigma(bandit=gaussian_bandit, @@ -36,6 +39,18 @@ print('True means: ' + str([95, 105])) print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means)) + # Gaussian + gaussian_bandit = StaticBandit(arms=[GaussianArm(mu=90, sigma=5), + GaussianArm(mu=110, sigma=5)]) + gaussian_strategy = ThompsonGaussian(bandit=gaussian_bandit, + mu_prior=100.0, nu_prior=0.01, + alpha_prior=1.01, beta_prior=1.0) + out = gaussian_strategy.fit(iterations=500) + print('Prior params: ' + str(gaussian_strategy.prior_params)) + print('Posterior params: ' + str(gaussian_strategy.posterior_params)) + print('True means: ' + str([95, 105])) + print('Arm mean reward estimates: ' + str(gaussian_strategy.estimated_arm_means)) + # Linear interpolation bandit with Gaussian errors dynamic_bandit = LinearInterpolationBandit(means=np.array([[5.0, 8.0], [10.0, 5.0]]), periods=[200, 200]) From f4201f676888d72c30358856b55764bf698cb2e4 Mon Sep 17 00:00:00 2001 From: Christopher Aicher Date: Tue, 14 Mar 2017 20:04:43 -0700 Subject: [PATCH 3/3] Changed `pull_arm` to private `_pull_arm` and `num_arms` to property --- strategy/thompson_sampling_strategy.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/strategy/thompson_sampling_strategy.py b/strategy/thompson_sampling_strategy.py index b8235c3..052d89b 100644 --- a/strategy/thompson_sampling_strategy.py +++ b/strategy/thompson_sampling_strategy.py @@ -25,7 +25,6 @@ class ThompsonSampling(Strategy): """ def __init__(self, bandit, **kwargs): self.bandit = bandit - self.num_arms = bandit.num_arms self.prior_params = [deepcopy(kwargs) for _ in range(self.num_arms)] self.posterior_params = deepcopy(self.prior_params) @@ -39,7 +38,7 @@ def fit(self, iterations, restart=False): estimated_arm_sds = [None] * iterations for i in range(iterations): index_arms_pulled[i] = self._choose_arm() - observed_rewards[i] = self.pull_arm(index_arms_pulled[i]) + observed_rewards[i] = self._pull_arm(index_arms_pulled[i]) self._update_posterior(index_arms_pulled[i], observed_rewards[i]) estimated_arm_means[i] = self.estimated_arm_means estimated_arm_sds[i] = self.estimated_arm_sds @@ -59,7 +58,7 @@ def _choose_arm(self): samples = [self._sample(**params) for params in self.posterior_params] return np.argmax(samples) - def pull_arm(self, arm_index): + def _pull_arm(self, arm_index): return self.bandit.pull_arm(arm_index) def _sample(self, **kwargs): @@ -68,6 +67,10 @@ def _sample(self, **kwargs): def _update_posterior(self, arm_index, observed_reward): raise NotImplementedError + @property + def num_arms(self): + return bandit.num_arms + @property def estimated_arm_means(self): raise NotImplementedError