From 951dbc30dd7f6841570187a78acf189aff6f5e62 Mon Sep 17 00:00:00 2001 From: marynel Date: Wed, 17 Feb 2016 13:36:58 -0500 Subject: [PATCH 1/8] fixed bug with policy_gradient and skeleton --- pyrl/agents/policy_gradient.py | 6 +++--- pyrl/agents/skeleton_agent.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyrl/agents/policy_gradient.py b/pyrl/agents/policy_gradient.py index 4c51c3b..064750e 100644 --- a/pyrl/agents/policy_gradient.py +++ b/pyrl/agents/policy_gradient.py @@ -113,14 +113,14 @@ class REINFORCE(policy_gradient): name = "REINFORCE" def agent_init(self,taskSpec): - super(REINFORCE, self).agent_init(self,taskSpec) + super(REINFORCE, self).agent_init(taskSpec) self.baseline_numerator = numpy.zeros(self.weights.shape) self.baseline_denom = numpy.zeros(self.weights.shape) self.gradient_estimate = numpy.zeros(self.weights.shape) self.ep_count = 0 def init_parameters(self): - super(REINFORCE, self).init_parameters(self) + super(REINFORCE, self).init_parameters() self.num_rollouts = self.params.setdefault('num_rollouts', 5) @classmethod @@ -145,7 +145,7 @@ def agent_start(self,observation): self.ep_count += 1 self.Return = 0.0 - return super(REINFORCE, self).agent_start(self, observation) + return super(REINFORCE, self).agent_start(observation) def update(self, phi_t, phi_tp, reward, compatFeatures): self.traces += compatFeatures diff --git a/pyrl/agents/skeleton_agent.py b/pyrl/agents/skeleton_agent.py index 3dc9950..f0acf27 100644 --- a/pyrl/agents/skeleton_agent.py +++ b/pyrl/agents/skeleton_agent.py @@ -130,7 +130,7 @@ def agent_message(self,inMessage): if inMessage.lower() == "agent_diverged?": return str(self.has_diverged()) else: - return name + " does not understand your message." + return self.name + " does not understand your message." def has_diverged(self): """Overwrite the function with one that checks the key values for your From e3be60824ef1ca54446805b2d708ec55424f71d6 Mon Sep 17 00:00:00 2001 From: marynel Date: Wed, 17 Feb 2016 15:54:29 -0500 Subject: [PATCH 2/8] Added documentation --- pyrl/agents/README.md | 153 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 153 insertions(+) diff --git a/pyrl/agents/README.md b/pyrl/agents/README.md index 442f1da..f632f57 100644 --- a/pyrl/agents/README.md +++ b/pyrl/agents/README.md @@ -2,3 +2,156 @@ pyrl.agents ========= Reinforcement Learning agents that have been implemented in python using the RLGlue framework. + +The following sections describe the algorithms that are implemented in the library and provide some useful references. Different basis can be used with linear function approximators (check the specific options of the agents). + +--- +### skeleton\_agnet.py +Base class for the agents. Do not use directly (picks random actions all the time). + +--- +### sarsa\_lambda\_ann.py +Implementation of SARSA (with eligibility traces) and neural networks for approximate Q estimation. The main reference for this algorithm is: + +Rummery G. and Niranjan, M. (1994). _On-line q-learning using connectionist systems_. Technical Report CUED/F-INFENG/TR 166, Cambridge University, Engineering Department. + +##### REQUIREMENTS +This agent is meant to work with continuous-state/discrete-action domains. + +##### NOTES +This method uses a single network to estimate Q. The number of inputs in the network is the size of the feature space; the number of outputs is the number of possible discrete actions. In contrast, the original paper proposed to use a *single* network per action. 
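For reference, the core update that both this agent and sarsa\_lambda.py (below) perform looks roughly like the following sketch. It is a simplified, linear version with accumulating eligibility traces, not the library's code, and all names (`weights`, `traces`, `phi`) are illustrative; the ANN agent replaces the linear map with the network's forward pass.

```python
import numpy

def egreedy(weights, phi, epsilon):
    """Random action with probability epsilon, otherwise greedy w.r.t. Q(s, .) = weights.T dot phi."""
    num_actions = weights.shape[1]
    if numpy.random.random() < epsilon:
        return numpy.random.randint(num_actions)
    return numpy.dot(weights.T, phi).argmax()

def sarsa_lambda_update(weights, traces, phi, a, reward, phi_next, a_next,
                        alpha=0.01, gamma=1.0, lmbda=0.7):
    """One on-policy TD(lambda) step for a linear Q approximator."""
    delta = reward + gamma * numpy.dot(weights[:, a_next], phi_next) \
                   - numpy.dot(weights[:, a], phi)
    traces *= gamma * lmbda    # decay every eligibility trace
    traces[:, a] += phi        # mark the features active for the chosen action
    weights += alpha * delta * traces
    return weights, traces
```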
+
+The original paper also suggests decreasing the exploration rate as the agent learns more about the environment. This implementation, however, appears to use a constant exploration rate (epsilon).
+
+---
+### sarsa\_lambda.py
+Implementation of SARSA (with eligibility traces) and a linear approximator for Q.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent is similar to sarsa\_lambda\_ann.py but with a simpler approximator for Q.
+
+---
+### qlearning.py
+Implementation of Q-Learning with a linear function approximator. Unlike SARSA, Q-Learning updates its estimate of Q using the greedy (maximizing) action in the next state rather than the action it actually takes, which makes it an off-policy method.
+
+The main reference for Q-Learning is:
+
+C. J. Watkins. [_Learning from Delayed Rewards_](https://www.cs.rhul.ac.uk/home/chrisw/new_thesis.pdf). PhD thesis, Cambridge University, 1989.
+
+A description of Q-Learning with linear function approximation can be found in:
+
+Francisco S. Melo and M. Isabel Ribeiro, [_Q-learning with linear function approximation_](http://gaips.inesc-id.pt/~fmelo/pub/melo07tr-b.pdf). Technical Report, RT-602-07, Instituto de Sistemas e Robótica, Pólo de Lisboa.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent inherits sarsa\_lambda.py and re-implements the agent_step and update functions.
+
+---
+### delayed_qlearning.py
+Implementation of [_PAC Model-Free Reinforcement Learning_](http://www.autonlab.org/icml_documents/camera-ready/111_PAC_Model_free_Reinf.pdf) by Alexander Strehl, Lihong Li, Eric Wiewiora, John Langford, and Michael Littman (2006).
+
+The standard Q-Learning agent changes its Q-value estimates on every time step. In contrast, Delayed Q-Learning waits for _m_ sample updates before making any change (_m_ is a parameter of the algorithm). According to the above paper, _"this variation has an averaging effect that mitigates some of the effects of randomness"_ and keeps the estimates optimistic. _"Since the action-selection strategy is greedy, the Delayed Q-Learning agent will tend to choose overly optimistic actions, therefore achieving direct exploration when necessary"_.
+
+##### REQUIREMENTS
+This agent is meant to work with discrete-state/discrete-action domains.
+
+##### NOTES
+There might be a bug in this implementation. The code notes: _"Unfortunately, I have not yet been able to get this to work consistently on the marble maze domain. It seems likely that it would work on something simpler like chain domain. Maybe there's a bug?"_.
+
+---
+### lstd.py
+Implements Least Squares Temporal Difference Learning (LSTD). The main reference for this agent is:
+
+Michail Lagoudakis and Ronald Parr, _Least-Squares Policy Iteration_. Journal of Machine Learning Research, v. 4, 2003.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+The code says: _"This is actually very nearly an implementation of LSTD-Q. The only difference with the paper, is that the code does not store the samples themselves, and instead stores A and b. This means that it can't reuse samples as effectively when the policy changes"_.
+
+The implementation inherits sarsa\_lambda.py.
+
+---
+### modelbased.py
+Implements an agent that learns a model of the environment (e.g., using linear regression, a support vector machine, or a random forest) and plans using Fitted Q Iteration.
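To make the planning step concrete, here is a rough sketch of generic Fitted Q Iteration on a batch of transitions. It is illustrative only, not the planner's actual code: it uses a scikit-learn random forest as a stand-in for whichever regressor the agent is configured with, and all function names are made up.

```python
import numpy
from sklearn.ensemble import RandomForestRegressor

def fitted_q_iteration(transitions, num_actions, gamma=0.95, iterations=30):
    """Generic FQI over (state, action, reward, next_state) tuples.
    Returns a regressor approximating Q on (state features, one-hot action) inputs."""
    states = numpy.array([t[0] for t in transitions])
    actions = numpy.array([t[1] for t in transitions])
    rewards = numpy.array([t[2] for t in transitions], dtype=float)
    next_states = numpy.array([t[3] for t in transitions])

    def encode(s, a):
        # Append a one-hot encoding of the discrete action to the state features
        return numpy.hstack([s, numpy.eye(num_actions)[a]])

    X = numpy.array([encode(s, a) for s, a in zip(states, actions)])
    targets = rewards.copy()                       # first fit: Q_1 is the immediate reward
    model = RandomForestRegressor(n_estimators=50)
    for _ in range(iterations):
        model.fit(X, targets)
        # Bootstrapped targets for the next iteration: r + gamma * max_a Q_k(s', a)
        next_q = numpy.array([[model.predict(encode(s, a).reshape(1, -1))[0]
                               for a in range(num_actions)]
                              for s in next_states])
        targets = rewards + gamma * next_q.max(axis=1)
    return model
```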
+The main reference for the planner is:
+
+Damien Ernst, Pierre Geurts and Louis Wehenkel, [_Tree-Based Batch Mode Reinforcement Learning_](http://www.jmlr.org/papers/volume6/ernst05a/ernst05a.pdf). Journal of Machine Learning Research, v.6, 2005.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This implementation supports a variety of basis functions for re-representing the agent's observations before they are passed to the model learners.
+
+The planner takes care of passing data to the model learner.
+
+---
+### mirror\_descent.py
+Implements [_Sparse Q-Learning with Mirror Descent_](http://www.auai.org/uai2012/papers/261.pdf) by Sridhar Mahadevan and Bo Liu, 2012. This is a _proximal-gradient_ based temporal-difference (TD) algorithm that uses a p-norm distance generating function.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent inherits qlearning.py.
+
+---
+### policy\_gradient.py (REINFORCE)
+Implements the [REINFORCE](http://www-anw.cs.umass.edu/~barto/courses/cs687/williams92simple.pdf) algorithm by Ronald J. Williams (1992).
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent inherits the policy_gradient class in policy\_gradient.py, which in turn inherits sarsa\_lambda.py.
+
+This agent currently breaks with the Tetris environment.
+
+---
+### policy\_gradient.py (twotime\_ac)
+Implements Regular-Gradient Actor-Critic. This is Algorithm 1 from [Natural Actor-Critic Algorithms](https://webdocs.cs.ualberta.ca/~sutton/papers/BSGL-TR.pdf) by Shalabh Bhatnagar, Richard S. Sutton, Mohammad Ghavamzadeh, and Mark Lee (2009).
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent inherits the policy_gradient class in policy\_gradient.py, which in turn inherits sarsa\_lambda.py.
+
+---
+### policy\_gradient.py (twotime\_nac)
+Implements Natural-Gradient Actor-Critic with Advantage Parameters. This is Algorithm 3 from [Natural Actor-Critic Algorithms](https://webdocs.cs.ualberta.ca/~sutton/papers/BSGL-TR.pdf) by Shalabh Bhatnagar, Richard S. Sutton, Mohammad Ghavamzadeh, and Mark Lee (2009).
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+This agent inherits the policy_gradient class in policy\_gradient.py, which in turn inherits sarsa\_lambda.py.
+
+---
+### policy\_gradient.py (nac_lstd)
+Implements the [Natural Actor-Critic](https://homes.cs.washington.edu/~todorov/courses/amath579/reading/NaturalActorCritic.pdf) agent by Jan Peters and Stefan Schaal (2007). The actor updates are based on stochastic policy gradients (using Amari's natural gradient), while the critic obtains the natural gradient and additional parameters of the value function by linear regression.
+
+##### REQUIREMENTS
+This agent is meant to work with continuous-state/discrete-action domains.
+
+##### NOTES
+As the code indicates, this implementation _"deviates from the pseudo-code given in the paper because it uses the Sherman-Morrison formula to do incremental updates to the matrix inverse"_.
+
+This agent inherits the policy_gradient class in policy\_gradient.py, which in turn inherits sarsa\_lambda.py.
+
+---
+### policy\_gradient.py (nac_sarsa)
+Implements the Natural Actor-Critic with SARSA(lambda) by Philip S. Thomas.
This is algorithm 2 in his 2012 [Bias in Natural Actor-Critic Algorithms](http://psthomas.com/papers/Thomas2012b.pdf) paper. + +##### REQUIREMENTS +This agent is meant to work with continuous-state/discrete-action domains. + +##### NOTES +The code says: _"While fundamentally the same as twotime\_nac (Algorithm 3 of BSGL's paper), this implements NACS which uses a different form of the same update equations. The main difference is in this algorithm's avoidance of the average reward accumulator"_. From 91d5b1189d6e4aa8ded257945d447511753232cf Mon Sep 17 00:00:00 2001 From: marynel Date: Wed, 17 Feb 2016 16:06:17 -0500 Subject: [PATCH 3/8] Added documentation --- pyrl/agents/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyrl/agents/README.md b/pyrl/agents/README.md index f632f57..211f2cb 100644 --- a/pyrl/agents/README.md +++ b/pyrl/agents/README.md @@ -3,10 +3,10 @@ pyrl.agents Reinforcement Learning agents that have been implemented in python using the RLGlue framework. -The following sections describe the algorithms that are implemented in the library and provide some useful references. Different basis can be used with linear function approximators (check the specific options of the agents). +The following sections describe the algorithms that are implemented in the library and provide some useful references. Different bases can be used with linear function approximators (check the specific options of the agents). --- -### skeleton\_agnet.py +### skeleton\_agent.py Base class for the agents. Do not use directly (picks random actions all the time). --- From bc64bf5ac7c673d5b0989d09609ef6e208d24a19 Mon Sep 17 00:00:00 2001 From: Marynel Vazquez Date: Wed, 17 Feb 2016 21:18:52 -0500 Subject: [PATCH 4/8] added default value to the agent's help message --- pyrl/misc/parameter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyrl/misc/parameter.py b/pyrl/misc/parameter.py index 57bb5db..17c4b99 100644 --- a/pyrl/misc/parameter.py +++ b/pyrl/misc/parameter.py @@ -88,7 +88,7 @@ def add_parameter(parser, name, min=0., max=1.0, optimize=True, **kwargs): if kwargs['type'] in [int, float]: value_range = ValueRange(min, max, dtype=kwargs['type']) kwargs['choices'] = value_range - kwargs['metavar'] = str(min) + ".." + str(max) + kwargs['metavar'] = str(min) + ".." 
+ str(max) + " (default: " + str(kwargs['default']) + ")" elif kwargs['type'] is not bool: raise TypeError("String typed parameter requires 'choices' argument") From 57e802991b5074134b9f02e82969c83a4b45a0ab Mon Sep 17 00:00:00 2001 From: marynel Date: Thu, 18 Feb 2016 17:56:29 -0500 Subject: [PATCH 5/8] added stuff to gitignore --- pyrl/basis/.gitignore | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pyrl/basis/.gitignore b/pyrl/basis/.gitignore index 618b79b..0be3228 100644 --- a/pyrl/basis/.gitignore +++ b/pyrl/basis/.gitignore @@ -27,3 +27,8 @@ pip-log.txt #Mr Developer .mr.developer.cfg + +CMakeCache.txt +CMakeFiles +Makefile +cmake_install.cmake \ No newline at end of file From 75e76603fbd5a3a5486e8bc80a8cddf2439b3db9 Mon Sep 17 00:00:00 2001 From: Marynel Vazquez Date: Sat, 20 Feb 2016 11:31:05 -0500 Subject: [PATCH 6/8] fixed bug in stepsizes --- pyrl/agents/stepsizes.py | 6 +++++- pyrl/basis/CTiles/.gitignore | 6 ++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pyrl/agents/stepsizes.py b/pyrl/agents/stepsizes.py index 3f4e7ed..4b11c71 100644 --- a/pyrl/agents/stepsizes.py +++ b/pyrl/agents/stepsizes.py @@ -220,7 +220,11 @@ def init_stepsize(self, weights_shape, params): def rescale_update(self, phi_t, phi_tp, delta, reward, descent_direction): deltaPhi = (self.gamma * phi_tp - phi_t).flatten() denomTerm = numpy.dot(self.traces.flatten(), deltaPhi.flatten()) - self.alpha = numpy.min([self.alpha, 1.0/numpy.abs(denomTerm)]) + absDenomTerm = numpy.abs(denomTerm) + if absDenomTerm > 1e-6: + self.alpha = numpy.min([self.alpha, 1.0/numpy.abs(denomTerm)]) + else: + self.alpha = self.alpha self.step_sizes.fill(self.alpha) return self.step_sizes * descent_direction diff --git a/pyrl/basis/CTiles/.gitignore b/pyrl/basis/CTiles/.gitignore index 618b79b..86b07dd 100644 --- a/pyrl/basis/CTiles/.gitignore +++ b/pyrl/basis/CTiles/.gitignore @@ -27,3 +27,9 @@ pip-log.txt #Mr Developer .mr.developer.cfg + +# Compiled CTiles +CMakeCache.txt +CMakeFiles +Makefile +cmake_install.cmake \ No newline at end of file From b21aed9c3dcd061effa57288b703b38b0d841bbd Mon Sep 17 00:00:00 2001 From: Marynel Vazquez Date: Sun, 21 Feb 2016 16:33:48 -0500 Subject: [PATCH 7/8] Added support for pikling agents, randomized the output of getAction in sarsa_lambda in case there is a tie, and added support for arguments without parameters --- pyrl/agents/sarsa_lambda.py | 11 +++++++-- pyrl/agents/skeleton_agent.py | 44 +++++++++++++++++++++++++++++++++++ pyrl/misc/parameter.py | 2 ++ 3 files changed, 55 insertions(+), 2 deletions(-) diff --git a/pyrl/agents/sarsa_lambda.py b/pyrl/agents/sarsa_lambda.py index ac72c4a..88fe62e 100644 --- a/pyrl/agents/sarsa_lambda.py +++ b/pyrl/agents/sarsa_lambda.py @@ -143,9 +143,16 @@ def sample_softmax(self, state, discState): return numpy.where(Q >= numpy.random.random())[0][0] def egreedy(self, state, discState): + if self.randGenerator.random() < self.epsilon: - return self.randGenerator.randint(0,self.numActions-1) - return numpy.dot(self.weights[discState,:,:].T, self.basis.computeFeatures(state)).argmax() + selected_action = self.randGenerator.randint(0,self.numActions-1) + else: + Qapprox = numpy.dot(self.weights[discState,:,:].T, self.basis.computeFeatures(state)) + selected_action = Qapprox.argmax() + max_options = numpy.where(Qapprox == Qapprox[selected_action])[0].tolist() + if len(max_options) > 1: + selected_action = max_options[self.randGenerator.randint(0,len(max_options)-1)] + return selected_action def getDiscState(self, state): """Return the 
integer value representing the current discrete state.

diff --git a/pyrl/agents/skeleton_agent.py b/pyrl/agents/skeleton_agent.py
index f0acf27..78e743d 100644
--- a/pyrl/agents/skeleton_agent.py
+++ b/pyrl/agents/skeleton_agent.py
@@ -19,6 +19,8 @@
 from pyrl.rlglue.registry import register_agent
 from pyrl.misc.parameter import *
 
+import cPickle
+
 @register_agent
 class skeleton_agent(Agent, object):
     name = "Skeleton agent"
@@ -129,6 +131,18 @@ def agent_message(self,inMessage):
         """
         if inMessage.lower() == "agent_diverged?":
             return str(self.has_diverged())
+        elif len(inMessage) > 10 and inMessage.lower()[0:10] == "save_agent":
+            filename = inMessage.split()[1]
+            if self.saveAgent(filename) is True:
+                return "%s saved the agent state to '%s'" % (self.name,filename)
+            else:
+                return "ERROR: Could not save the agent to %s" % filename
+        elif len(inMessage) > 10 and inMessage.lower()[0:10] == "load_agent":
+            filename = inMessage.split()[1]
+            if self.loadAgent(filename) is True:
+                return "%s loaded the agent state from '%s'" % (self.name,filename)
+            else:
+                return "ERROR: Could not load the agent state from %s" % filename
         else:
             return self.name + " does not understand your message."
 
@@ -140,6 +154,36 @@ def has_diverged(self):
 
         return False
 
+    def loadAgent(self, filename):
+        """Unpickle the agent
+        Args:
+            filename - file with pickled agent
+        """
+        try:
+            f = open(filename,'rb')
+            tmp_dict = cPickle.load(f)
+            f.close()
+            self.__dict__.update(tmp_dict)
+        except IOError:
+            print "Failed to load agent from %s" % filename
+            return False
+        return True
+
+    def saveAgent(self, filename):
+        """Pickle the agent
+        Args:
+            filename - filename of pickled agent
+        """
+        try:
+            f = open(filename,'wb')
+            cPickle.dump(self.__dict__,f,2)
+            f.close()
+        except IOError:
+            print "Failed to save agent to %s" % filename
+            return False
+        return True
+
+
 def runAgent(agent_class):
     """Use the agent_parameters function to parse command line arguments and run the RL agent in network mode.
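The persistence hooks added above can be exercised either through RL-Glue agent messages or by calling the methods directly. A minimal sketch (illustrative only: the file path is made up and `agent` stands for any already-constructed pyrl agent built on skeleton_agent):

```python
# Snapshot the agent's learned state (pickles agent.__dict__ to the given file).
print agent.agent_message("save_agent /tmp/agent_snapshot.pck")

# Later, e.g. on a freshly constructed agent of the same class, restore the attributes.
print agent.agent_message("load_agent /tmp/agent_snapshot.pck")

# The same can be done without going through agent_message:
if agent.saveAgent("/tmp/agent_snapshot.pck"):
    agent.loadAgent("/tmp/agent_snapshot.pck")
```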
diff --git a/pyrl/misc/parameter.py b/pyrl/misc/parameter.py index 17c4b99..b01c638 100644 --- a/pyrl/misc/parameter.py +++ b/pyrl/misc/parameter.py @@ -81,6 +81,8 @@ def add_parameter(parser, name, min=0., max=1.0, optimize=True, **kwargs): if kwargs.has_key('choices'): kwargs.setdefault('type', kwargs['choices'][0].__class__) + elif kwargs.has_key('action'): + pass else: # Otherwise, default to float kwargs.setdefault('type', float) From 265c0dd9e320217d90d122bd984ab3879a389d54 Mon Sep 17 00:00:00 2001 From: Marynel Vazquez Date: Mon, 29 Feb 2016 10:44:31 -0500 Subject: [PATCH 8/8] Added support for string params to load pickled agent --- pyrl/agents/sarsa_lambda.py | 6 ++++-- pyrl/agents/skeleton_agent.py | 3 ++- pyrl/misc/parameter.py | 7 ++++--- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pyrl/agents/sarsa_lambda.py b/pyrl/agents/sarsa_lambda.py index 88fe62e..892513a 100644 --- a/pyrl/agents/sarsa_lambda.py +++ b/pyrl/agents/sarsa_lambda.py @@ -28,7 +28,7 @@ def init_parameters(self): self.epsilon = self.params.setdefault('epsilon', 0.1) self.alpha = self.params.setdefault('alpha', 0.01) self.lmbda = self.params.setdefault('lmbda', 0.7) - self.gamma = self.params.setdefault('gamma', 1.0) + #self.gamma = self.params.setdefault('gamma', 1.0) use env discount factor self.fa_name = self.params.setdefault('basis', 'trivial') self.softmax = self.params.setdefault('softmax', False) self.basis = None @@ -38,7 +38,7 @@ def agent_parameters(cls): param_set = super(sarsa_lambda, cls).agent_parameters() add_parameter(param_set, "alpha", default=0.01, help="Step-size parameter") add_parameter(param_set, "epsilon", default=0.1, help="Exploration rate for epsilon-greedy, or rescaling factor for soft-max.") - add_parameter(param_set, "gamma", default=1.0, help="Discount factor") + # add_parameter(param_set, "gamma", default=1.0, help="Discount factor") add_parameter(param_set, "lmbda", default=0.7, help="Eligibility decay rate") # Parameters *NOT* used in parameter optimization @@ -81,6 +81,8 @@ def agent_init(self,taskSpec): print "Task Spec could not be parsed: "+taskSpecString; sys.exit(1) + self.gamma = TaskSpec.getDiscountFactor() + self.numStates=len(TaskSpec.getDoubleObservations()) self.discStates = numpy.array(TaskSpec.getIntObservations()) self.numDiscStates = int(reduce(lambda a, b: a * (b[1] - b[0] + 1), self.discStates, 1.0)) diff --git a/pyrl/agents/skeleton_agent.py b/pyrl/agents/skeleton_agent.py index 78e743d..fe40e36 100644 --- a/pyrl/agents/skeleton_agent.py +++ b/pyrl/agents/skeleton_agent.py @@ -162,7 +162,8 @@ def loadAgent(self, filename): try: f = open(filename,'rb') tmp_dict = cPickle.load(f) - f.close() + f.close() + print "Updating agent dictionary with the pickled data (%s)" % filename self.__dict__.update(tmp_dict) except IOError: print "Failed to load agent from %s" % filename diff --git a/pyrl/misc/parameter.py b/pyrl/misc/parameter.py index b01c638..5eb57f7 100644 --- a/pyrl/misc/parameter.py +++ b/pyrl/misc/parameter.py @@ -65,7 +65,8 @@ def sample_exprand(self, size=None): def parameter_set(alg_name, **kwargs): kwargs['prog'] = alg_name kwargs['conflict_handler'] = 'resolve' - kwargs['add_help'] = False + if not kwargs.has_key('add_help'): + kwargs['add_help'] = False parser = argparse.ArgumentParser(**kwargs) parser.add_argument_group(title="optimizable", description="Algorithm parameters that should/can be optimized. 
" + \ @@ -91,8 +92,8 @@ def add_parameter(parser, name, min=0., max=1.0, optimize=True, **kwargs): value_range = ValueRange(min, max, dtype=kwargs['type']) kwargs['choices'] = value_range kwargs['metavar'] = str(min) + ".." + str(max) + " (default: " + str(kwargs['default']) + ")" - elif kwargs['type'] is not bool: - raise TypeError("String typed parameter requires 'choices' argument") + # elif kwargs['type'] is not bool: + # raise TypeError("String typed parameter requires 'choices' argument") if optimize: i = map(lambda k: k.title, parser._action_groups).index("optimizable")