diff --git a/doc/spec/estimation/dml.rst b/doc/spec/estimation/dml.rst
index 536419abd..200d9d677 100644
--- a/doc/spec/estimation/dml.rst
+++ b/doc/spec/estimation/dml.rst
@@ -52,11 +52,11 @@ characteristics :math:`X` of the treated samples, then one can use this method.
 
     # DML
    import numpy as np
-    X = np.random.choice(np.arange(5), size=(100,3))
+    X = np.random.choice(6, size=(100,3))
     Y = np.random.normal(size=(100,2))
     y = np.random.normal(size=(100,))
-    T = T0 = T1 = np.random.choice(np.arange(3), size=(100,2))
-    t = t0 = t1 = T[:,0]
+    (T, T0, T1) = (np.random.choice(np.arange(3), size=(100,2)) for _ in range(3))
+    (t, t0, t1) = (a[:,0] for a in (T, T0, T1))
     W = np.random.normal(size=(100,2))
 
 .. testcode::
@@ -646,7 +646,7 @@ Then we can estimate the coefficients :math:`\alpha_i` by running:
 
     from sklearn.preprocessing import PolynomialFeatures
     est = LinearDML(model_y=RandomForestRegressor(), model_t=RandomForestRegressor(),
-                    featurizer=PolynomialFeatures(degree=3, include_bias=True))
+                    featurizer=PolynomialFeatures(degree=3, include_bias=False))
     est.fit(y, T, X=X, W=W)
 
     # To get the coefficients of the polynomial fitted in the final stage we can
@@ -663,7 +663,7 @@ To add fixed effect heterogeneity, we can create one-hot encodings of the id, wh
     from econml.dml import LinearDML
     from sklearn.preprocessing import OneHotEncoder
     # removing one id to avoid colinearity, as is standard for fixed effects
-    X_oh = OneHotEncoder(sparse_output=False).fit_transform(X)[:, 1:]
+    X_oh = OneHotEncoder(sparse_output=False, drop="first").fit_transform(X)
 
     est = LinearDML(model_y=RandomForestRegressor(),
                     model_t=RandomForestRegressor())
diff --git a/econml/iv/sieve/_tsls.py b/econml/iv/sieve/_tsls.py
index ffed3350d..0439d4b39 100644
--- a/econml/iv/sieve/_tsls.py
+++ b/econml/iv/sieve/_tsls.py
@@ -151,7 +151,7 @@ def transform(self, X):
         for i in range(X.shape[1]):
             p = powers.copy()
             c = powers[:, i]
-            p[:, i] -= 1
+            p[p[:, i] > 0, i] -= 1
             M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :])
             result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1)
         return result
diff --git a/econml/tests/test_treatment_featurization.py b/econml/tests/test_treatment_featurization.py
index 068a07847..a3cf55fe7 100644
--- a/econml/tests/test_treatment_featurization.py
+++ b/econml/tests/test_treatment_featurization.py
@@ -486,20 +486,21 @@ def test_featurization(self):
     def test_jac(self):
         def func_transform(x):
             x = x.reshape(-1, 1)
-            return np.hstack([x, x**2])
+            return np.hstack([np.ones_like(x), x, x**2])
 
         def calc_expected_jacobian(T):
-            jac = DPolynomialFeatures(degree=2, include_bias=False).fit_transform(T)
+            jac = DPolynomialFeatures(degree=2, include_bias=True).fit_transform(T)
             return jac
 
         treatment_featurizers = [
-            PolynomialFeatures(degree=2, include_bias=False),
+            PolynomialFeatures(degree=2, include_bias=True),
             FunctionTransformer(func=func_transform)
         ]
 
         n = 10000
         d_t = 1
         T = np.random.normal(size=(n, d_t))
+        T[0, 0] = 0  # hardcode one value of exactly zero to test that we don't generate nan
 
         for treatment_featurizer in treatment_featurizers:
             # fit a dummy estimator first so the featurizer can be fit to the treatment
diff --git a/econml/utilities.py b/econml/utilities.py
index e8fc18867..90a22a089 100644
--- a/econml/utilities.py
+++ b/econml/utilities.py
@@ -1460,9 +1460,15 @@ def jac(self, X, epsilon=0.001):
         powers = self.featurizer.powers_
         result = np.zeros(X.shape + (self.featurizer.n_output_features_,))
         for i in range(X.shape[1]):
-            p = powers.copy()
+            # d/dx_i of x_1^p_1 * x_2^p_2 * ... x_i^p_i * ... x_n^p_n
+            # = p_i * x_1^p_1 * x_2^p_2 * ... x_i^(p_i-1) * ... x_n^p_n
+            # store the coefficient in c, and the updated powers in p
             c = powers[:, i]
-            p[:, i] -= 1
+            p = powers.copy()
+            # decrement p[:, i], but only if it was more than 0 already
+            # (if it is 0 then c=0 so we'll correctly get 0 for a result regardless of the updated entries in p,
+            #  but float_power would return nan if X has a 0 and the exponent is -1)
+            p[p[:, i] > 0, i] -= 1
             M = np.float_power(X[:, np.newaxis, :], p[np.newaxis, :, :])
             result[:, i, :] = c[np.newaxis, :] * np.prod(M, axis=-1)
         return result
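
For context on the masked decrement in _tsls.py and utilities.py: np.float_power maps 0.0 raised to a negative exponent to inf, so the old unconditional decrement turned the bias column's exponent 0 into -1 and produced 0 * inf = nan whenever a feature was exactly zero. Below is a minimal standalone sketch of that failure mode and of the fix; it is not part of the patch, and the variable names are purely illustrative.

import numpy as np

# powers_ of PolynomialFeatures(degree=2, include_bias=True) fit on one feature:
# output columns are [1, x, x^2], i.e. exponents 0, 1, 2
powers = np.array([[0], [1], [2]])
X = np.array([[0.0]])  # a feature value of exactly zero triggers the problem
i = 0
c = powers[:, i]  # derivative coefficients: [0, 1, 2]

# old behavior: decrement unconditionally, so the bias exponent becomes -1
p_bad = powers.copy()
p_bad[:, i] -= 1
M_bad = np.float_power(X[:, np.newaxis, :], p_bad[np.newaxis, :, :])
print(c[np.newaxis, :] * np.prod(M_bad, axis=-1))  # [[nan  1.  0.]], since 0 * inf is nan

# patched behavior: only decrement exponents that are already positive
p_ok = powers.copy()
p_ok[p_ok[:, i] > 0, i] -= 1
M_ok = np.float_power(X[:, np.newaxis, :], p_ok[np.newaxis, :, :])
print(c[np.newaxis, :] * np.prod(M_ok, axis=-1))  # [[0. 1. 0.]], d/dx of [1, x, x^2] at x=0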