From c722ed3e379f53b73d7f5f6033864813d30d2675 Mon Sep 17 00:00:00 2001 From: Jake Coltman Date: Sat, 13 Jan 2018 18:44:50 +0000 Subject: [PATCH 01/15] Refine comment of performance --- .idea/workspace.xml | 70 ++++++++++++++++++++++++++------------------- README.md | 8 +----- 2 files changed, 41 insertions(+), 37 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 51b1ec8..0a1aeed 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,9 +1,9 @@ - + - + @@ -86,21 +86,11 @@ - - - - - - - - - - - + - + @@ -108,6 +98,19 @@ + + + + + + + + + + + + + @@ -200,7 +203,6 @@ @@ -761,7 +764,14 @@ \ No newline at end of file diff --git a/README.md b/README.md index e8aa4b4..181f961 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,4 @@ Implementing this philosophy has a number of positive effects on the library: * There are no hand-offs to non-python objects * Models allow for substitution of any of their composite blocks -The trade-off to get these goods is performance. Models provide in the library are designed to be tweakable, which limits performance optimizations. This manifests itself in a number of ways: - - * Straight up crunching speed - * Memory useage - * Models often don't exploit conjugacy where it exists - -For very large data sets or very complicated models, you might be better off using something like Stan. \ No newline at end of file +In general, the trade-off to buy the above is speed and memory performance. Constructing models in a compositional, modifiable way often leads to not being able to get performance improvements. For medium sized data sets, this isn't a problem, but for very large data sets and complex models speed can be a problem. \ No newline at end of file From 927fa773519630ce2f8e24f340082e8d30377e33 Mon Sep 17 00:00:00 2001 From: Jake Coltman Date: Sun, 14 Jan 2018 09:19:04 +0000 Subject: [PATCH 02/15] Add a node class to encapsulate fixed data --- .idea/workspace.xml | 200 +++++++++++++++++--------------------- SurPyval/node/datanode.py | 20 ++++ 2 files changed, 107 insertions(+), 113 deletions(-) create mode 100644 SurPyval/node/datanode.py diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 0a1aeed..120f78b 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -2,8 +2,8 @@ + - @@ -31,11 +31,9 @@ - - - - - + + + @@ -43,7 +41,7 @@ - + @@ -53,8 +51,8 @@ - - + + @@ -65,8 +63,8 @@ - - + + @@ -77,7 +75,7 @@ - + @@ -89,8 +87,8 @@ - - + + @@ -98,24 +96,21 @@ - - - - - - - - - + + + + + + - + - - + + @@ -127,7 +122,7 @@ - + @@ -136,8 +131,8 @@ - - + + @@ -215,6 +210,7 @@ @@ -245,8 +241,6 @@ - - @@ -299,6 +293,8 @@ - - + @@ -839,7 +856,14 @@ @@ -897,7 +921,6 @@ - @@ -913,6 +936,7 @@ + @@ -931,7 +955,6 @@ \ No newline at end of file diff --git a/SurPyval/model/fitmodel.py b/SurPyval/model/fitmodel.py index 4b53c96..c8e5db4 100644 --- a/SurPyval/model/fitmodel.py +++ b/SurPyval/model/fitmodel.py @@ -1,4 +1,5 @@ from typing import Dict, Any +import numpy as np from SurPyval.node import NodeTree, Node from SurPyval.samplers import EmceeSampler @@ -34,11 +35,24 @@ def predict(self, node_dict: Dict[str, Node]): fitted_model = FitModel(fitted_node_tree, self.posterior) return fitted_model - def sample_survival(self): - def plot_survival_function(surv_node): - start_point = surv_node.distribution.ppf( 0.005, **parsed_n )[0] - end_point = surv_node.distribution.ppf( 0.995, **parsed_n )[0] - x_s = np.linspace( start_point, end_point, 1000 ) - vals = [surv_node.distribution.sf( x, **parsed_n )[0] for x in x_s] - from matplotlib import pyplot as plt - plt.plot( x_s, vals ) \ No newline at end of file + def survival_function(self, flat_parameter_array): + param_dict = self.node_tree.unflatten_parameter_array(flat_parameter_array) + + def surv_function(**kwargs): + return self.node_tree["y"].sf(**{**param_dict, **kwargs}) + + return surv_function + + def sample_survival_function(self, n_samples): + posterior_samples = self.posterior.sample(n_samples)[:n_samples] + return [self.survival_function(x) for x in posterior_samples] + + def plot_survival_function(self): + param_dict = self.node_tree.unflatten_parameter_array(self.posterior.maximum_likihood) + start_point = self.node_tree["y"].ppf(0.005, **param_dict) + end_point = self.node_tree["y"].ppf(0.995, **param_dict) + x_s = np.linspace(start_point, end_point, 1000) + surv_function = self.survival_function(self.posterior.maximum_likihood) + vals = surv_function(y=x_s) + from matplotlib import pyplot as plt + plt.plot(x_s, vals) diff --git a/SurPyval/model/model.py b/SurPyval/model/model.py index 0e0ca16..2a4860f 100644 --- a/SurPyval/model/model.py +++ b/SurPyval/model/model.py @@ -1,5 +1,4 @@ from scipy.optimize import minimize -import emcee as em import numpy as np from SurPyval.node.tree import NodeTree @@ -40,16 +39,8 @@ def fit(self, n_walkers: int =4, burn: int =500): The function has the side effect of populating self.posterior """ - def generate_starting_points(): - max_lik_point = self.maximum_likihood() - return [max_lik_point + np.random.normal(0, 0.01, int(self.node_tree.length())) for x in range(n_walkers)] - - ndim = self.node_tree.length() - sampler = em.EnsembleSampler(n_walkers, ndim, self.node_tree.logpdf) - p0 = generate_starting_points() - pos, prob, state = sampler.run_mcmc(p0, burn) - - posterior = EmceeSampler(sampler, pos) + posterior = EmceeSampler(self.node_tree.logpdf, self.maximum_likihood(), n_walkers) + posterior.sample(burn) return FitModel(self.node_tree, posterior) def maximum_likihood(self): diff --git a/SurPyval/node/datalikihoodnode.py b/SurPyval/node/datalikihoodnode.py index 02b195f..0ceb6a5 100644 --- a/SurPyval/node/datalikihoodnode.py +++ b/SurPyval/node/datalikihoodnode.py @@ -20,6 +20,6 @@ def __init__(self, distribution, data: Any, parameter_dict: Dict[str, str], cons def logpdf(self, **kwargs): processed_kwargs = self.parse_unflattened_parameters(**kwargs) - likihood_contribution = self.distribution.logpdf(kwargs["y"], **processed_kwargs) - survival_contribution = self.distribution.logsf(kwargs["y"], **processed_kwargs) + likihood_contribution = self.distribution.logpdf(self.data, **processed_kwargs) + survival_contribution = self.distribution.logsf(self.data, **processed_kwargs) return np.sum(likihood_contribution[kwargs["event"].astype(bool)]) + np.sum(survival_contribution[~kwargs["event"].astype(bool)]) \ No newline at end of file diff --git a/SurPyval/node/node.py b/SurPyval/node/node.py index cfe605d..764443d 100644 --- a/SurPyval/node/node.py +++ b/SurPyval/node/node.py @@ -48,7 +48,7 @@ def __init__(self, distribution: rv_continuous, parameter_dict: Dict[str, str], def parse_unflattened_parameters(self, **kwargs): filtered_kwargs = {x: kwargs[x] for x in kwargs if x in self.parameter_names} renamed_kwargs = {self.parameter_dict[x]: filtered_kwargs[x] for x in filtered_kwargs} - return {**renamed_kwargs, **self.constants_dict} + return {**self.constants_dict, **renamed_kwargs} def logpdf(self, **kwargs): return self.distribution.logpdf(**self.parse_unflattened_parameters(**kwargs)) @@ -56,8 +56,11 @@ def logpdf(self, **kwargs): def pdf(self, **kwargs): return self.distribution.logpdf(**self.parse_unflattened_parameters(**kwargs)) - def ppf(self, **kwargs): - return self.distribution.ppf(**self.parse_unflattened_parameters(**kwargs)) + def sf(self, **kwargs): + return self.distribution.sf(**self.parse_unflattened_parameters(**kwargs)) + + def ppf(self, q, **kwargs): + return self.distribution.ppf(q, **self.parse_unflattened_parameters(**kwargs)) def sample(self, size=1, **kwargs): """ diff --git a/SurPyval/node/tree.py b/SurPyval/node/tree.py index 3505770..8af2707 100644 --- a/SurPyval/node/tree.py +++ b/SurPyval/node/tree.py @@ -25,7 +25,7 @@ def __init__(self, node_dict: Dict[str, Node]): self.transformations: List[DeterministicNode] = [x for x in node_dict.values() if type(x) is DeterministicNode] self.likihood_nodes: List[DataLikihoodNode] = [x for x in node_dict.values() if type(x) is DataLikihoodNode] - self.data_dict = {x[0]: x[1].data for x in node_dict.items() if type(x[1]) is DataLikihoodNode or type(x[1]) is DataNode} + self.data_dict = {x[0]: x[1].data for x in node_dict.items() if type(x[1]) is type(x[1]) is DataNode} self.node_dict = node_dict self.node_names = sorted(node_dict.keys()) self.flat_split_point = self.flattened_parameter_split_points() diff --git a/SurPyval/parametric/fitted/exponential.py b/SurPyval/parametric/fitted/exponential.py index 50a6219..d253e77 100644 --- a/SurPyval/parametric/fitted/exponential.py +++ b/SurPyval/parametric/fitted/exponential.py @@ -22,7 +22,7 @@ class FittedExponential(Model): Fit an exponential distribution to the life lengths Nodes: - * llambda - the main parameter for the exponential distribution + * alpha - the scale of the exponential distribution * y - predictive distribution for lifetime Likihood: @@ -38,7 +38,7 @@ class FittedExponential(Model): def __init__(self, y, event, alpha_prior): node_dict = { "alpha": alpha_prior, - "y": DataLikihoodNode(scipy.stats.expon, y, {"alpha": "scale"}), + "y": DataLikihoodNode(scipy.stats.expon, y, {"alpha": "scale", "y": "x"}), "event": DataNode("event", event) } diff --git a/SurPyval/samplers/emceesampler.py b/SurPyval/samplers/emceesampler.py index 176b5c1..9519cf4 100644 --- a/SurPyval/samplers/emceesampler.py +++ b/SurPyval/samplers/emceesampler.py @@ -1,14 +1,23 @@ +import emcee as em +import numpy as np from SurPyval.samplers.sampler import Sampler + class EmceeSampler(Sampler): - - def __init__(self, sampler, pos): - self.sampler = sampler - self.pos = pos + + def __init__(self, lik_function, maximum_likihood, n_walkers=4): + self.lik_function = lik_function + self.n_walkers = n_walkers + self.maximum_likihood = maximum_likihood + self.pos = self.generate_starting_points() + self.sampler = em.EnsembleSampler(n_walkers, len(self.maximum_likihood), self.lik_function) def sample(self, n_samples): self.sampler.reset() - self.sampler.run_mcmc(self.pos, n_samples) + self.pos, prob, state = self.sampler.run_mcmc(self.pos, n_samples) return self.sampler.flatchain + def generate_starting_points(self): + max_lik_point = self.maximum_likihood + return [max_lik_point + np.random.normal(0, 0.01, len(self.maximum_likihood)) for x in range(self.n_walkers)] From 471cf460ff1f844387904bd61905620e897fdbef Mon Sep 17 00:00:00 2001 From: Jake Coltman Date: Mon, 15 Jan 2018 18:11:16 +0000 Subject: [PATCH 13/15] Allow survival function sampling in ExponentialRegression --- .idea/workspace.xml | 206 ++++++++++-------- SurPyval/model/model.py | 2 +- SurPyval/parametric/regression/exponential.py | 2 +- 3 files changed, 113 insertions(+), 97 deletions(-) diff --git a/.idea/workspace.xml b/.idea/workspace.xml index 3135479..96ef740 100644 --- a/.idea/workspace.xml +++ b/.idea/workspace.xml @@ -1,15 +1,10 @@ - + - - - - - - + @@ -34,11 +29,11 @@ - + - - + + @@ -61,8 +56,8 @@ - - + + @@ -73,8 +68,8 @@ - - + + @@ -82,21 +77,31 @@ + + + + + + + + + + - - + + - + - - + + @@ -104,14 +109,12 @@ - - + + - - - - - + + + @@ -119,8 +122,8 @@ - - + + @@ -128,11 +131,11 @@ - - + + - - + + @@ -140,16 +143,6 @@ - - - - - - - - - - @@ -173,6 +166,8 @@ append_transformations genera flattened_parameter_split_points + predict + nan @@ -214,15 +209,15 @@ @@ -309,6 +304,12 @@ + + + + + +