From 62e84ec5894824b7443f6be8e97f249ba1655293 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 22 Mar 2024 23:04:41 -0400 Subject: [PATCH 01/10] Use environments for additional deployment safety Signed-off-by: Keith Battocchi Signed-off-by: Gabriel Daiha --- .github/workflows/ci.yml | 2 +- .github/workflows/publish-documentation.yml | 3 ++- .github/workflows/publish-package.yml | 13 +++++++------ 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6d9df3844..1ae55a779 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -346,7 +346,7 @@ jobs: uses: ./.github/workflows/publish-package.yml with: publish: false - repository: testpypi + environment: test # don't have access to env context here for some reason ref: ${{ github.event_name == 'workflow_dispatch' && inputs.ref || null }} # can't use env context here so need to duplicate expression, but these are true boolean values so don't need extra string logic diff --git a/.github/workflows/publish-documentation.yml b/.github/workflows/publish-documentation.yml index 499764cd6..0f9731433 100644 --- a/.github/workflows/publish-documentation.yml +++ b/.github/workflows/publish-documentation.yml @@ -91,11 +91,12 @@ jobs: publish-docs: name: Publish documentation - runs-on: ubuntu-latest needs: create_docs permissions: id-token: write # needed to publish to Azure + environment: ${{ inputs.environment }} if: ${{ inputs.publish }} + runs-on: ubuntu-latest steps: - name: Download docs artifact uses: actions/download-artifact@v3 diff --git a/.github/workflows/publish-package.yml b/.github/workflows/publish-package.yml index 7cd4322c2..d9dfbbdca 100644 --- a/.github/workflows/publish-package.yml +++ b/.github/workflows/publish-package.yml @@ -8,12 +8,12 @@ on: required: false default: true type: boolean - repository: + environment: description: 'Whether to publish to production PyPI or test PyPI' required: false - default: pypi + default: prod type: choice - options: [pypi, testpypi] + options: [prod, test] ref: description: 'The git ref to build the package for' required: false @@ -34,10 +34,10 @@ on: default: true type: boolean # choice type only supported for workflow_dispatch, not workflow_call - repository: + environment: description: 'Whether to publish to production PyPI or test PyPI' required: false - default: pypi + default: prod type: string ref: description: 'The git ref to build the package for' @@ -119,6 +119,7 @@ jobs: needs: [merge] permissions: id-token: write + environment: ${{ inputs.environment }} if: ${{ inputs.publish }} runs-on: ubuntu-latest steps: @@ -130,4 +131,4 @@ jobs: - name: Upload wheels and sdist to package index uses: pypa/gh-action-pypi-publish@release/v1 with: - repository-url: ${{ inputs.repository == 'testpypi' && 'https://test.pypi.org/legacy/' || '' }} + repository-url: ${{ inputs.environment == 'test' && 'https://test.pypi.org/legacy/' || '' }} From 4309e847cd1602680940a9ad2b9201ba07575de3 Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 13:38:29 -0300 Subject: [PATCH 02/10] Avoid double impurity calculation Signed-off-by: Gabriel Daiha --- econml/tree/_criterion.pyx | 24 ++++++++++++++---------- econml/tree/_splitter.pyx | 15 ++++++++++----- 2 files changed, 24 insertions(+), 15 deletions(-) diff --git a/econml/tree/_criterion.pyx b/econml/tree/_criterion.pyx index d0b12a92f..c0b00d351 100644 --- a/econml/tree/_criterion.pyx +++ b/econml/tree/_criterion.pyx @@ -209,7 +209,9 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, double impurity) nogil: + cdef double impurity_improvement(self, float64_t impurity_parent, + float64_t impurity_left, + float64_t impurity_right) nogil: """Compute the improvement in impurity This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: @@ -218,22 +220,24 @@ cdef class Criterion: where N is the total number of samples, N_t is the number of samples at the current node, N_t_L is the number of samples in the left child, and N_t_R is the number of samples in the right child, + Parameters ---------- - impurity : double - The initial impurity of the node before the split + impurity_parent : float64_t + The initial impurity of the parent node before the split + + impurity_left : float64_t + The impurity of the left child + + impurity_right : float64_t + The impurity of the right child + Return ------ double : improvement in impurity after the split occurs """ - - cdef double impurity_left - cdef double impurity_right - - self.children_impurity(&impurity_left, &impurity_right) - return ((self.weighted_n_node_samples / self.weighted_n_samples) * - (impurity - (self.weighted_n_right / + (impurity_parent - (self.weighted_n_right / self.weighted_n_node_samples * impurity_right) - (self.weighted_n_left / self.weighted_n_node_samples * impurity_left))) diff --git a/econml/tree/_splitter.pyx b/econml/tree/_splitter.pyx index 557dc6e59..fb2d74fb0 100644 --- a/econml/tree/_splitter.pyx +++ b/econml/tree/_splitter.pyx @@ -615,11 +615,6 @@ cdef class BestSplitter(Splitter): if self.honest: self.criterion_val.reset() self.criterion_val.update(best.pos_val) - # Calculate a more accurate version of impurity improvement using the input baseline impurity - # passed here by the TreeBuilder. The TreeBuilder uses the proxy_node_impurity() to calculate - # this baseline if self.is_children_impurity_proxy(), else uses the call to children_impurity() - # on the parent node, when that node was split. - best.improvement = self.criterion.impurity_improvement(impurity) # if we need children impurities by the builder, then we populate these entries # otherwise, we leave them blank to avoid the extra computation. if not self.is_children_impurity_proxy(): @@ -630,6 +625,16 @@ cdef class BestSplitter(Splitter): else: best.impurity_left_val = best.impurity_left best.impurity_right_val = best.impurity_right + + # Calculate a more accurate version of impurity improvement using the input baseline impurity + # passed here by the TreeBuilder. The TreeBuilder uses the proxy_node_impurity() to calculate + # this baseline if self.is_children_impurity_proxy(), else uses the call to children_impurity() + # on the parent node, when that node was split. + best.improvement = self.criterion.impurity_improvement(impurity, + best.impurity_left, best.impurity_right) + + if self.honest: + # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling From c00b8cf5e07893a4de227cdb7da428d850759c5e Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 13:41:04 -0300 Subject: [PATCH 03/10] moved comment inside criterion.pyx Signed-off-by: Gabriel Daiha --- econml/tree/_splitter.pyx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/econml/tree/_splitter.pyx b/econml/tree/_splitter.pyx index fb2d74fb0..426a7e7b4 100644 --- a/econml/tree/_splitter.pyx +++ b/econml/tree/_splitter.pyx @@ -615,6 +615,10 @@ cdef class BestSplitter(Splitter): if self.honest: self.criterion_val.reset() self.criterion_val.update(best.pos_val) + # Calculate a more accurate version of impurity improvement using the input baseline impurity + # passed here by the TreeBuilder. The TreeBuilder uses the proxy_node_impurity() to calculate + # this baseline if self.is_children_impurity_proxy(), else uses the call to children_impurity() + # on the parent node, when that node was split. # if we need children impurities by the builder, then we populate these entries # otherwise, we leave them blank to avoid the extra computation. if not self.is_children_impurity_proxy(): @@ -626,10 +630,6 @@ cdef class BestSplitter(Splitter): best.impurity_left_val = best.impurity_left best.impurity_right_val = best.impurity_right - # Calculate a more accurate version of impurity improvement using the input baseline impurity - # passed here by the TreeBuilder. The TreeBuilder uses the proxy_node_impurity() to calculate - # this baseline if self.is_children_impurity_proxy(), else uses the call to children_impurity() - # on the parent node, when that node was split. best.improvement = self.criterion.impurity_improvement(impurity, best.impurity_left, best.impurity_right) From 3164b7c12fef41403431ffa53ecbeb6186290163 Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 13:42:10 -0300 Subject: [PATCH 04/10] removed unused self.honest condition Signed-off-by: Gabriel Daiha --- econml/tree/_splitter.pyx | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/econml/tree/_splitter.pyx b/econml/tree/_splitter.pyx index 426a7e7b4..dd30e7586 100644 --- a/econml/tree/_splitter.pyx +++ b/econml/tree/_splitter.pyx @@ -631,10 +631,7 @@ cdef class BestSplitter(Splitter): best.impurity_right_val = best.impurity_right best.improvement = self.criterion.impurity_improvement(impurity, - best.impurity_left, best.impurity_right) - - if self.honest: - + best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling From c88a3488eb0d1c97a1305bcab94cf71815a69ba4 Mon Sep 17 00:00:00 2001 From: Keith Battocchi Date: Fri, 3 May 2024 00:26:52 -0400 Subject: [PATCH 05/10] Use macos-12 instead of macos-latest to avoid ARM until dependencies are available. Signed-off-by: Keith Battocchi Signed-off-by: Gabriel Daiha --- .github/workflows/ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1ae55a779..4e02aa507 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,12 +201,12 @@ jobs: if: ${{ needs.eval.outputs.testCode == 'True' }} strategy: matrix: - os: [ubuntu-latest, windows-latest, macos-latest] + os: [ubuntu-latest, windows-latest, macos-12] python-version: ['3.8', '3.9', '3.10', '3.11', '3.12'] kind: [serial, other, dml, main, treatment, ray] exclude: # Serial tests fail randomly on mac sometimes, so we don't run them there - - os: macos-latest + - os: macos-12 kind: serial # Ray tests run out of memory on Windows - os: windows-latest From 390287c03dbc60ed796fb8b56a3289aa23a46b64 Mon Sep 17 00:00:00 2001 From: gdaiha <110778471+gdaiha@users.noreply.github.com> Date: Tue, 7 May 2024 05:23:11 -0300 Subject: [PATCH 06/10] Optimizing NormalInferenceResults confidence interval method speed (#879) * Fixed normal inference results confidence interval unnecessary loop Signed-off-by: gdaiha <110778471+gdaiha@users.noreply.github.com> Signed-off-by: Gabriel Daiha Signed-off-by: Gabriel Daiha --- econml/inference/_inference.py | 17 +++------------ econml/sklearn_extensions/linear_model.py | 26 +++++++++-------------- 2 files changed, 13 insertions(+), 30 deletions(-) diff --git a/econml/inference/_inference.py b/econml/inference/_inference.py index 23658f846..438151943 100644 --- a/econml/inference/_inference.py +++ b/econml/inference/_inference.py @@ -1066,14 +1066,9 @@ def conf_int(self, alpha=0.05): """ if self.stderr is None: raise AttributeError("Only point estimates are available!") - if np.isscalar(self.point_estimate): + else: return _safe_norm_ppf(alpha / 2, loc=self.point_estimate, scale=self.stderr), \ _safe_norm_ppf(1 - alpha / 2, loc=self.point_estimate, scale=self.stderr) - else: - return np.array([_safe_norm_ppf(alpha / 2, loc=p, scale=err) - for p, err in zip(self.point_estimate, self.stderr)]), \ - np.array([_safe_norm_ppf(1 - alpha / 2, loc=p, scale=err) - for p, err in zip(self.point_estimate, self.stderr)]) def pvalue(self, value=0): """ @@ -1398,14 +1393,8 @@ def conf_int_mean(self, *, alpha=None): alpha = self.alpha if alpha is None else alpha mean_point = self.mean_point stderr_mean = self.stderr_mean - if np.isscalar(mean_point): - return (_safe_norm_ppf(alpha / 2, loc=mean_point, scale=stderr_mean), - _safe_norm_ppf(1 - alpha / 2, loc=mean_point, scale=stderr_mean)) - else: - return np.array([_safe_norm_ppf(alpha / 2, loc=p, scale=err) - for p, err in zip(mean_point, stderr_mean)]), \ - np.array([_safe_norm_ppf(1 - alpha / 2, loc=p, scale=err) - for p, err in zip(mean_point, stderr_mean)]) + return (_safe_norm_ppf(alpha / 2, loc=mean_point, scale=stderr_mean), + _safe_norm_ppf(1 - alpha / 2, loc=mean_point, scale=stderr_mean)) @property def std_point(self): diff --git a/econml/sklearn_extensions/linear_model.py b/econml/sklearn_extensions/linear_model.py index c95f392de..a16b3d852 100644 --- a/econml/sklearn_extensions/linear_model.py +++ b/econml/sklearn_extensions/linear_model.py @@ -1627,10 +1627,8 @@ def coef__interval(self, alpha=0.05): coef__interval : {tuple ((p, d) array, (p,d) array), tuple ((d,) array, (d,) array)} The lower and upper bounds of the confidence interval of the coefficients """ - return np.array([_safe_norm_ppf(alpha / 2, loc=p, scale=err) - for p, err in zip(self.coef_, self.coef_stderr_)]), \ - np.array([_safe_norm_ppf(1 - alpha / 2, loc=p, scale=err) - for p, err in zip(self.coef_, self.coef_stderr_)]) + return (_safe_norm_ppf(alpha / 2, loc=self.coef_, scale=self.coef_stderr_), + _safe_norm_ppf(1 - alpha / 2, loc=self.coef_, scale=self.coef_stderr_)) def intercept__interval(self, alpha=0.05): """ @@ -1651,14 +1649,8 @@ def intercept__interval(self, alpha=0.05): return (0 if self._n_out == 0 else np.zeros(self._n_out)), \ (0 if self._n_out == 0 else np.zeros(self._n_out)) - if self._n_out == 0: - return _safe_norm_ppf(alpha / 2, loc=self.intercept_, scale=self.intercept_stderr_), \ - _safe_norm_ppf(1 - alpha / 2, loc=self.intercept_, scale=self.intercept_stderr_) - else: - return np.array([_safe_norm_ppf(alpha / 2, loc=p, scale=err) - for p, err in zip(self.intercept_, self.intercept_stderr_)]), \ - np.array([_safe_norm_ppf(1 - alpha / 2, loc=p, scale=err) - for p, err in zip(self.intercept_, self.intercept_stderr_)]) + return (_safe_norm_ppf(alpha / 2, loc=self.intercept_, scale=self.intercept_stderr_), + _safe_norm_ppf(1 - alpha / 2, loc=self.intercept_, scale=self.intercept_stderr_)) def predict_interval(self, X, alpha=0.05): """ @@ -1677,10 +1669,12 @@ def predict_interval(self, X, alpha=0.05): prediction_intervals : {tuple ((n,) array, (n,) array), tuple ((n,p) array, (n,p) array)} The lower and upper bounds of the confidence intervals of the predicted mean outcomes """ - return np.array([_safe_norm_ppf(alpha / 2, loc=p, scale=err) - for p, err in zip(self.predict(X), self.prediction_stderr(X))]), \ - np.array([_safe_norm_ppf(1 - alpha / 2, loc=p, scale=err) - for p, err in zip(self.predict(X), self.prediction_stderr(X))]) + + pred = self.predict(X) + pred_stderr = self.prediction_stderr(X) + + return (_safe_norm_ppf(alpha / 2, loc=pred, scale=pred_stderr), + _safe_norm_ppf(1 - alpha / 2, loc=pred, scale=pred_stderr)) class StatsModelsLinearRegression(_StatsModelsWrapper): From 546ecc8f8bb9788ea1b7fd348dd41b119d9c7db0 Mon Sep 17 00:00:00 2001 From: Kevin Gao Date: Tue, 7 May 2024 14:24:43 -0400 Subject: [PATCH 07/10] Fix flaky RScorer test (#876) Increase sample size to reduce errors Signed-off-by: kgao Signed-off-by: Gabriel Daiha --- econml/tests/test_rscorer.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/econml/tests/test_rscorer.py b/econml/tests/test_rscorer.py index 9ca2d5c50..4f41b5016 100644 --- a/econml/tests/test_rscorer.py +++ b/econml/tests/test_rscorer.py @@ -21,9 +21,9 @@ def _fit_model(name, model, Y, T, X): class TestRScorer(unittest.TestCase): def _get_data(self): - X = np.random.normal(0, 1, size=(2000, 2)) - T = np.random.binomial(1, .5, size=(2000,)) - y = X[:, 0] * T + np.random.normal(size=(2000,)) + X = np.random.normal(0, 1, size=(100000, 2)) + T = np.random.binomial(1, .5, size=(100000,)) + y = X[:, 0] * T + np.random.normal(size=(100000,)) return y, T, X, X[:, 0] def test_comparison(self): From 727bba92cf9ee6cd59618b985ffa9adbf040e184 Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 13:59:05 -0300 Subject: [PATCH 08/10] fixed criterion.pxd with new signature Signed-off-by: Gabriel Daiha --- econml/tree/_criterion.pxd | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/econml/tree/_criterion.pxd b/econml/tree/_criterion.pxd index 2eb6b8ba3..c0125d788 100644 --- a/econml/tree/_criterion.pxd +++ b/econml/tree/_criterion.pxd @@ -78,7 +78,9 @@ cdef class Criterion: cdef void node_value(self, double* dest) nogil cdef void node_jacobian(self, double* dest) nogil cdef void node_precond(self, double* dest) nogil - cdef double impurity_improvement(self, double impurity) nogil + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil cdef double proxy_impurity_improvement(self) nogil cdef double min_eig_left(self) nogil cdef double min_eig_right(self) nogil From 0df03b6d2d3ccb94294792ce16368a009f856d6a Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 14:01:47 -0300 Subject: [PATCH 09/10] fixed types in impurity_improvement definition in criterion Signed-off-by: Gabriel Daiha --- econml/tree/_criterion.pyx | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/econml/tree/_criterion.pyx b/econml/tree/_criterion.pyx index c0b00d351..a2f9acdb3 100644 --- a/econml/tree/_criterion.pyx +++ b/econml/tree/_criterion.pyx @@ -209,9 +209,9 @@ cdef class Criterion: return (- self.weighted_n_right * impurity_right - self.weighted_n_left * impurity_left) - cdef double impurity_improvement(self, float64_t impurity_parent, - float64_t impurity_left, - float64_t impurity_right) nogil: + cdef double impurity_improvement(self, double impurity_parent, + double impurity_left, + double impurity_right) nogil: """Compute the improvement in impurity This method computes the improvement in impurity when a split occurs. The weighted impurity improvement equation is the following: From 1d44cef36fe2736d333f52f5bccbc87b70a41c2c Mon Sep 17 00:00:00 2001 From: Gabriel Daiha Date: Fri, 10 May 2024 14:32:18 -0300 Subject: [PATCH 10/10] removed best.improvement from is_children_impurity_proxy condition Signed-off-by: Gabriel Daiha --- econml/tree/_splitter.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/econml/tree/_splitter.pyx b/econml/tree/_splitter.pyx index dd30e7586..d022c2038 100644 --- a/econml/tree/_splitter.pyx +++ b/econml/tree/_splitter.pyx @@ -630,8 +630,8 @@ cdef class BestSplitter(Splitter): best.impurity_left_val = best.impurity_left best.impurity_right_val = best.impurity_right - best.improvement = self.criterion.impurity_improvement(impurity, - best.impurity_left, best.impurity_right) + best.improvement = self.criterion.impurity_improvement(impurity, + best.impurity_left, best.impurity_right) # Respect invariant for constant features: the original order of # element in features[:n_known_constants] must be preserved for sibling