From 40297a443e8b6694beb2ef75c8a00ca209133a99 Mon Sep 17 00:00:00 2001
From: Richard Pelgrim <68642378+rrpelgrim@users.noreply.github.com>
Date: Tue, 14 Jun 2022 13:39:28 +0200
Subject: [PATCH 1/5] fix things on clean branch

---
 mongodb-with-coiled/dask-mongo.ipynb | 259 +++++++++++++--------------
 mongodb-with-coiled/environment.yml  |   2 +-
 2 files changed, 127 insertions(+), 134 deletions(-)

diff --git a/mongodb-with-coiled/dask-mongo.ipynb b/mongodb-with-coiled/dask-mongo.ipynb
index 0d36dec..8279ff6 100644
--- a/mongodb-with-coiled/dask-mongo.ipynb
+++ b/mongodb-with-coiled/dask-mongo.ipynb
@@ -55,30 +55,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "fa1c3b19-feff-4f92-8752-079e277949a7",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Creating new software environment\n",
-      "Creating new ecr build\n",
-      "STEP 1: FROM coiled/default:sha-6b4e896\n",
-      "STEP 2: COPY environment.yml environment.yml\n",
-      "--> 7cf6c61c926\n",
-      "STEP 3: RUN conda env update -n coiled -f environment.yml     && rm environment.yml     && conda clean --all -y     && echo \"conda activate coiled\" >> ~/.bashrc\n",
-      "Collecting package metadata (repodata.json): ...working... done\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "coiled.create_software_environment(\n",
-    "    account=\"coiled-examples\",\n",
-    "    name=\"dask-mongo\",\n",
-    "    conda=\"/Users/rpelgrim/Documents/git/coiled-resources/mongodb-with-coiled/environment.yml\",\n",
-    ")"
+    "# coiled.create_software_environment(\n",
+    "#     account=\"coiled-examples\",\n",
+    "#     name=\"dask-mongo\",\n",
+    "#     conda=\"/Users/rpelgrim/Documents/git/coiled-resources/mongodb-with-coiled/environment.yml\",\n",
+    "# )"
    ]
   },
   {
@@ -90,7 +76,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "1085c0ea11494440badf92e75b1859a0",
+       "model_id": "fbd02af99d1d4e26acf8b5ddbb8569aa",
        "version_major": 2,
        "version_minor": 0
       },
@@ -135,7 +121,24 @@
    "execution_count": 4,
    "id": "cef41594-b06a-4440-93f8-da727b294181",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/client.py:1265: VersionMismatchWarning: Mismatched versions found\n",
+      "\n",
+      "+---------+--------+-----------+---------+\n",
+      "| Package | client | scheduler | workers |\n",
+      "+---------+--------+-----------+---------+\n",
+      "| msgpack | 1.0.4  | 1.0.3     | 1.0.3   |\n",
+      "+---------+--------+-----------+---------+\n",
+      "Notes: \n",
+      "-  msgpack: Variation is ok, as long as everything is above 0.6\n",
+      "  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
+     ]
+    }
+   ],
    "source": [
     "from dask.distributed import Client\n",
     "client = Client(cluster)"
@@ -152,7 +155,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
    "id": "4f2669ed-1fc2-42fa-87dd-2f191f804d32",
    "metadata": {},
    "outputs": [
@@ -171,7 +174,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 6,
    "id": "7fff2141-43d8-4aa5-8aad-169cbc55d301",
    "metadata": {},
    "outputs": [],
@@ -192,7 +195,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 7,
    "id": "eaa90416-45bd-4965-8770-f3219e7882c2",
    "metadata": {},
    "outputs": [],
@@ -209,7 +212,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 8,
    "id": "d321e022-f68b-4842-b28b-ef930e7593c5",
    "metadata": {},
    "outputs": [
@@ -219,7 +222,7 @@
        "dask.bag<read_mongo, npartitions=12>"
       ]
      },
-     "execution_count": 10,
+     "execution_count": 8,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -241,7 +244,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 9,
    "id": "59fed42a-b8d0-4a2e-8c38-88231d8a56fa",
    "metadata": {},
    "outputs": [
@@ -665,7 +668,7 @@
        "    'comments': \"The house was extremely well located and Ana was able to give us some really great tips on locations to have lunch and eat out. The house was perfectly clean and the easily able to accommodate 6 people despite only having one bathroom. The beds and living room were comfortable. \\n\\nHowever, we always felt somewhat on edge in the house due to the number of signs posted around the kitchen, bedrooms and bathroom about being charged 15€ for all sorts of extras like not washing up or using extra towels and bed linen. Not that this would be particularly unreasonable but it made us feel like we were walking on egg shells in and around the house. \\n\\nThe hosts were aware that we were a group of six yet one of the beds was not prepared and we ran out of toilet paper well before we were due to check out despite only being there 2 nights. It really wasn't the end of the world but the shower head does not have a wall fitting meaning you had to hold it yourself if you wanted to stand underneath it.\"}]},)"
       ]
      },
-     "execution_count": 11,
+     "execution_count": 9,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -689,7 +692,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 10,
    "id": "b5002201-31ba-4e64-9961-4f37359e560a",
    "metadata": {},
    "outputs": [],
@@ -710,7 +713,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "id": "31fff276-d1d8-468d-b893-4835ce26ae09",
    "metadata": {},
    "outputs": [],
@@ -725,7 +728,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 12,
    "id": "bbd3b6d6-f22e-40de-af79-9dcc2b95d9dc",
    "metadata": {},
    "outputs": [
@@ -738,7 +741,7 @@
        "  'review_rating': 94})"
       ]
      },
-     "execution_count": 14,
+     "execution_count": 12,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -761,7 +764,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 13,
    "id": "ca7492b7-bf82-4111-b12a-5b8b101afd63",
    "metadata": {},
    "outputs": [],
@@ -773,7 +776,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 14,
    "id": "54347e2e-6b91-440d-9e87-28c6a4c26a32",
    "metadata": {},
    "outputs": [],
@@ -784,7 +787,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 15,
    "id": "090b7238-6ddc-45ba-bf1a-7eb9fb0ba96b",
    "metadata": {},
    "outputs": [],
@@ -803,7 +806,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 16,
    "id": "6d717e86-aa92-4183-b464-a5666c9e4ee8",
    "metadata": {},
    "outputs": [
@@ -853,7 +856,7 @@
        "1  Murphy bed, optional second bedroom available....             94"
       ]
      },
-     "execution_count": 17,
+     "execution_count": 16,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -873,7 +876,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 17,
    "id": "6572a460-b3e3-4c44-9e71-2e4573c6fe2c",
    "metadata": {},
    "outputs": [
@@ -923,7 +926,7 @@
        "1  murphy bed, optional second bedroom available....             94"
       ]
      },
-     "execution_count": 18,
+     "execution_count": 17,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -945,7 +948,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 18,
    "id": "ceb3f5b5-c41a-4d67-8c9a-352b9321ef42",
    "metadata": {},
    "outputs": [
@@ -1013,7 +1016,7 @@
        "4  [clean, fully, furnish, spacious, 1, bedroom, ...            100"
       ]
      },
-     "execution_count": 19,
+     "execution_count": 18,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1048,7 +1051,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 19,
    "id": "7c50c7c7-b515-4e51-955d-8e6de32a4e12",
    "metadata": {},
    "outputs": [],
@@ -1058,7 +1061,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 20,
    "id": "848a8eab-92ff-4003-bc7f-1d5c9134ecff",
    "metadata": {},
    "outputs": [],
@@ -1080,7 +1083,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 21,
    "id": "b3079756-79de-4d5d-b835-831d305cb122",
    "metadata": {},
    "outputs": [],
@@ -1094,7 +1097,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 22,
    "id": "63378d46-eaaf-45e7-85a3-a234d12fdcba",
    "metadata": {},
    "outputs": [],
@@ -1107,7 +1110,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 23,
    "id": "97e8065f-401a-4201-bb6e-c3278009da12",
    "metadata": {},
    "outputs": [],
@@ -1118,7 +1121,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 24,
    "id": "5eb5efac-60c3-4816-9e50-ec3bef6f6fec",
    "metadata": {},
    "outputs": [
@@ -1186,7 +1189,7 @@
        "4  [clean, fully, furnish, spacious, 1, bedroom, ...            100"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1210,7 +1213,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 25,
    "id": "d126152c-d608-4ccf-996a-e2073ee49bbb",
    "metadata": {},
    "outputs": [],
@@ -1223,10 +1226,76 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 28,
+   "id": "151ca35d-74c1-4653-bc30-00b7b234f4cf",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [28]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\n\u001b[1;32m      2\u001b[0m dask_ml\u001b[38;5;241m.\u001b[39m__version__\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
+     ]
+    }
+   ],
+   "source": [
+    "import dask_ml\n",
+    "dask_ml.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "4428342f-81db-4d4c-aea4-9d44f9efff22",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [27]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
+     ]
+    }
+   ],
+   "source": [
+    "from dask_ml.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
    "id": "222c57c9-c493-41b8-ac18-18e25b50c63f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "ImportError",
+     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [26]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# create train/test splits\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[1;32m      3\u001b[0m X_train, X_test, y_train, y_test \u001b[38;5;241m=\u001b[39m train_test_split(\n\u001b[1;32m      4\u001b[0m     X, \n\u001b[1;32m      5\u001b[0m     y, \n\u001b[1;32m      6\u001b[0m     test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, \n\u001b[1;32m      7\u001b[0m     random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m,\n\u001b[1;32m      8\u001b[0m )\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
+      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
+      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
+     ]
+    }
+   ],
    "source": [
     "# create train/test splits\n",
     "from dask_ml.model_selection import train_test_split\n",
@@ -1248,7 +1317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": null,
    "id": "92bc2fc9-cd96-415a-b088-9f9bcf479e00",
    "metadata": {},
    "outputs": [],
@@ -1269,86 +1338,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": null,
    "id": "5f6a755a-2721-4150-9723-e7a409a3e8e5",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<table>\n",
-       "    <tr>\n",
-       "        <td>\n",
-       "            <table>\n",
-       "                <thead>\n",
-       "                    <tr>\n",
-       "                        <td> </td>\n",
-       "                        <th> Array </th>\n",
-       "                        <th> Chunk </th>\n",
-       "                    </tr>\n",
-       "                </thead>\n",
-       "                <tbody>\n",
-       "                    \n",
-       "                    <tr>\n",
-       "                        <th> Shape </th>\n",
-       "                        <td> (2139, 1048576) </td>\n",
-       "                        <td> (211, 1048576) </td>\n",
-       "                    </tr>\n",
-       "                    <tr>\n",
-       "                        <th> Count </th>\n",
-       "                        <td> 108 Tasks </td>\n",
-       "                        <td> 12 Chunks </td>\n",
-       "                    </tr>\n",
-       "                    <tr>\n",
-       "                    <th> Type </th>\n",
-       "                    <td> float64 </td>\n",
-       "                    <td> scipy.sparse.csr.csr_matrix </td>\n",
-       "                    </tr>\n",
-       "                </tbody>\n",
-       "            </table>\n",
-       "        </td>\n",
-       "        <td>\n",
-       "        <svg width=\"170\" height=\"75\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
-       "\n",
-       "  <!-- Horizontal lines -->\n",
-       "  <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n",
-       "  <line x1=\"0\" y1=\"2\" x2=\"120\" y2=\"2\" />\n",
-       "  <line x1=\"0\" y1=\"4\" x2=\"120\" y2=\"4\" />\n",
-       "  <line x1=\"0\" y1=\"6\" x2=\"120\" y2=\"6\" />\n",
-       "  <line x1=\"0\" y1=\"9\" x2=\"120\" y2=\"9\" />\n",
-       "  <line x1=\"0\" y1=\"11\" x2=\"120\" y2=\"11\" />\n",
-       "  <line x1=\"0\" y1=\"13\" x2=\"120\" y2=\"13\" />\n",
-       "  <line x1=\"0\" y1=\"15\" x2=\"120\" y2=\"15\" />\n",
-       "  <line x1=\"0\" y1=\"17\" x2=\"120\" y2=\"17\" />\n",
-       "  <line x1=\"0\" y1=\"19\" x2=\"120\" y2=\"19\" />\n",
-       "  <line x1=\"0\" y1=\"20\" x2=\"120\" y2=\"20\" />\n",
-       "  <line x1=\"0\" y1=\"23\" x2=\"120\" y2=\"23\" />\n",
-       "  <line x1=\"0\" y1=\"25\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
-       "\n",
-       "  <!-- Vertical lines -->\n",
-       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"25\" style=\"stroke-width:2\" />\n",
-       "  <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
-       "\n",
-       "  <!-- Colored Rectangle -->\n",
-       "  <polygon points=\"0.0,0.0 120.0,0.0 120.0,25.412616514582485 0.0,25.412616514582485\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
-       "\n",
-       "  <!-- Text -->\n",
-       "  <text x=\"60.000000\" y=\"45.412617\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >1048576</text>\n",
-       "  <text x=\"140.000000\" y=\"12.706308\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,140.000000,12.706308)\">2139</text>\n",
-       "</svg>\n",
-       "        </td>\n",
-       "    </tr>\n",
-       "</table>"
-      ],
-      "text/plain": [
-       "dask.array<_transformer, shape=(2139, 1048576), dtype=float64, chunksize=(211, 1048576), chunktype=scipy.csr_matrix>"
-      ]
-     },
-     "execution_count": 18,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
     "# make sure to compute the chunk sizes before training\n",
     "X_train_vect.compute_chunk_sizes()"
@@ -1356,7 +1349,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": null,
    "id": "d559a8b2-4888-45db-a33e-235091ea8699",
    "metadata": {},
    "outputs": [],
diff --git a/mongodb-with-coiled/environment.yml b/mongodb-with-coiled/environment.yml
index 0b7d098..36bd739 100644
--- a/mongodb-with-coiled/environment.yml
+++ b/mongodb-with-coiled/environment.yml
@@ -6,7 +6,7 @@ dependencies:
   - python==3.9.10
   - coiled==0.2.6
   - pandas==1.4.1
-  - dask[complete]==2022.02.0
+  - dask[complete]==2022.05.0
   - nltk==3.6.5
   - spacy==3.2.0
   - pyarrow

From 58efe98ddc960dfc410eae92d5f37e918cc97164 Mon Sep 17 00:00:00 2001
From: Richard Pelgrim <68642378+rrpelgrim@users.noreply.github.com>
Date: Tue, 14 Jun 2022 15:37:23 +0200
Subject: [PATCH 2/5] Update notebook to write data back to mongodb

---
 mongodb-with-coiled/dask-mongo.ipynb | 987 +++++++++++++++++++--------
 1 file changed, 689 insertions(+), 298 deletions(-)

diff --git a/mongodb-with-coiled/dask-mongo.ipynb b/mongodb-with-coiled/dask-mongo.ipynb
index 8279ff6..f358988 100644
--- a/mongodb-with-coiled/dask-mongo.ipynb
+++ b/mongodb-with-coiled/dask-mongo.ipynb
@@ -76,7 +76,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "fbd02af99d1d4e26acf8b5ddbb8569aa",
+       "model_id": "55ae007800324f6996b66a699a648d08",
        "version_major": 2,
        "version_minor": 0
       },
@@ -121,24 +121,7 @@
    "execution_count": 4,
    "id": "cef41594-b06a-4440-93f8-da727b294181",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/client.py:1265: VersionMismatchWarning: Mismatched versions found\n",
-      "\n",
-      "+---------+--------+-----------+---------+\n",
-      "| Package | client | scheduler | workers |\n",
-      "+---------+--------+-----------+---------+\n",
-      "| msgpack | 1.0.4  | 1.0.3     | 1.0.3   |\n",
-      "+---------+--------+-----------+---------+\n",
-      "Notes: \n",
-      "-  msgpack: Variation is ok, as long as everything is above 0.6\n",
-      "  warnings.warn(version_module.VersionMismatchWarning(msg[0][\"warning\"]))\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from dask.distributed import Client\n",
     "client = Client(cluster)"
@@ -684,8 +667,8 @@
    "source": [
     "This outputs a very large amount of data, including the listing itself as well as all the available reviews. The data is unwieldy and not in a format we can input to a regular machine learning model. \n",
     "\n",
-    "### Subset Data to Build a Machine Learning Pipeline\n",
-    "Let’s simplify this into a Machine Learning prediction problem. Let’s use the text in the Description field to predict the Review Rating and limit ourselves to listings of type “Apartment” only.\n",
+    "### Subset Data \n",
+    "Let’s use the text in the Description field to predict the Review Rating and limit ourselves to listings of type “Apartment” only.\n",
     "\n",
     "To do this, we’ll have to flatten the semi-structured JSON into a tabular form. We’ll use the processing function defined below to extract only the relevant information from all records. We'll then filter out only the Apartment listings, flatten the data structure and turn it into a Dask Dataframe."
    ]
@@ -1062,6 +1045,18 @@
   {
    "cell_type": "code",
    "execution_count": 20,
+   "id": "cd15b2cb-db7c-49e1-b383-b453d17485e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # run this cell if you've not used spacy in this env before\n",
+    "# # this will download the lexicon files\n",
+    "# ! python -m spacy download en"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
    "id": "848a8eab-92ff-4003-bc7f-1d5c9134ecff",
    "metadata": {},
    "outputs": [],
@@ -1083,7 +1078,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 22,
    "id": "b3079756-79de-4d5d-b835-831d305cb122",
    "metadata": {},
    "outputs": [],
@@ -1097,7 +1092,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 23,
    "id": "63378d46-eaaf-45e7-85a3-a234d12fdcba",
    "metadata": {},
    "outputs": [],
@@ -1110,7 +1105,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 24,
    "id": "97e8065f-401a-4201-bb6e-c3278009da12",
    "metadata": {},
    "outputs": [],
@@ -1121,7 +1116,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 25,
    "id": "5eb5efac-60c3-4816-9e50-ec3bef6f6fec",
    "metadata": {},
    "outputs": [
@@ -1189,7 +1184,7 @@
        "4  [clean, fully, furnish, spacious, 1, bedroom, ...            100"
       ]
      },
-     "execution_count": 24,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1198,305 +1193,164 @@
     "ddf.head()"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "99975628-6064-4115-a6c1-87716044fc03",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Vectorization with Dask-ML\n",
-    "We can now transform our lemmatized tokens into numerical representations (vectors) that can be input to a machine learning algorithm.\n",
-    "\n",
-    "To do this, we’ll first create our predictor and target features and our train and test splits:"
-   ]
-  },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "d126152c-d608-4ccf-996a-e2073ee49bbb",
+   "execution_count": null,
+   "id": "b37cfc3f-e463-4873-9687-c29f7bf644ba",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "# insert b_flattened here to bypass custom tokenization\n",
-    "ddf = b_flattened.to_dataframe()\n",
-    "X = ddf['description'].to_dask_array(lengths=True)\n",
-    "y = ddf['review_rating'].to_dask_array(lengths=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "id": "151ca35d-74c1-4653-bc30-00b7b234f4cf",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "ImportError",
-     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [28]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\n\u001b[1;32m      2\u001b[0m dask_ml\u001b[38;5;241m.\u001b[39m__version__\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
-      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
-     ]
-    }
-   ],
-   "source": [
-    "import dask_ml\n",
-    "dask_ml.__version__"
-   ]
+   "source": []
   },
   {
-   "cell_type": "code",
-   "execution_count": 27,
-   "id": "4428342f-81db-4d4c-aea4-9d44f9efff22",
+   "cell_type": "markdown",
+   "id": "53e13148-8f9b-4363-89e7-0ddb8f32b904",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "ImportError",
-     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [27]\u001b[0m, in \u001b[0;36m<cell line: 1>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
-      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
-     ]
-    }
-   ],
    "source": [
-    "from dask_ml.model_selection import train_test_split"
+    "## 5. Write Data Back to MongoDB\n",
+    "\n",
+    "Now that we have processed our raw JSON NLP data into a neat tabular format, we'll want to store this for future use. You can use the `to_mongo` method to write data to your MongoDB database.\n",
+    "\n",
+    "You'll first need to convert your Dask DataFrame to a Dask Bag:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 26,
-   "id": "222c57c9-c493-41b8-ac18-18e25b50c63f",
+   "id": "b5577f05-f8fa-4257-a443-f07ae285b792",
    "metadata": {},
    "outputs": [
     {
-     "ename": "ImportError",
-     "evalue": "cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [26]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# create train/test splits\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m train_test_split\n\u001b[1;32m      3\u001b[0m X_train, X_test, y_train, y_test \u001b[38;5;241m=\u001b[39m train_test_split(\n\u001b[1;32m      4\u001b[0m     X, \n\u001b[1;32m      5\u001b[0m     y, \n\u001b[1;32m      6\u001b[0m     test_size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0.20\u001b[39m, \n\u001b[1;32m      7\u001b[0m     random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m40\u001b[39m,\n\u001b[1;32m      8\u001b[0m )\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/__init__.py:4\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpkg_resources\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m DistributionNotFound, get_distribution\n\u001b[1;32m      3\u001b[0m \u001b[38;5;66;03m# Ensure we always register tokenizers\u001b[39;00m\n\u001b[0;32m----> 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdask_ml\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _normalize\n\u001b[1;32m      6\u001b[0m __all__ \u001b[38;5;241m=\u001b[39m []\n\u001b[1;32m      8\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/__init__.py:8\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_hyperband\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m HyperbandSearchCV\n\u001b[1;32m      7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_incremental\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m IncrementalSearchCV, InverseDecaySearchCV\n\u001b[0;32m----> 8\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m GridSearchCV, RandomizedSearchCV, check_cv, compute_n_splits\n\u001b[1;32m      9\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m KFold, ShuffleSplit, train_test_split\n\u001b[1;32m     10\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_successive_halving\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m SuccessiveHalvingSearchCV\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/dask_ml/model_selection/_search.py:20\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m     18\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mbase\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseEstimator, MetaEstimatorMixin, clone, is_classifier\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexceptions\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m NotFittedError\n\u001b[0;32m---> 20\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_search\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m BaseSearchCV, _check_param_grid\n\u001b[1;32m     21\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodel_selection\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01m_split\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m     22\u001b[0m     BaseShuffleSplit,\n\u001b[1;32m     23\u001b[0m     KFold,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     31\u001b[0m     _CVIterableWrapper,\n\u001b[1;32m     32\u001b[0m )\n\u001b[1;32m     33\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01msklearn\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpipeline\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m FeatureUnion, Pipeline\n",
-      "\u001b[0;31mImportError\u001b[0m: cannot import name '_check_param_grid' from 'sklearn.model_selection._search' (/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/sklearn/model_selection/_search.py)"
-     ]
+     "data": {
+      "text/plain": [
+       "({'description': ['here',\n",
+       "   'exist',\n",
+       "   'a',\n",
+       "   'very',\n",
+       "   'cozy',\n",
+       "   'room',\n",
+       "   'for',\n",
+       "   'rent',\n",
+       "   'in',\n",
+       "   'a',\n",
+       "   'share',\n",
+       "   '4',\n",
+       "   'bedroom',\n",
+       "   'apartment',\n",
+       "   'it',\n",
+       "   'be',\n",
+       "   'locate',\n",
+       "   'one',\n",
+       "   'block',\n",
+       "   'off',\n",
+       "   'of',\n",
+       "   'the',\n",
+       "   'jmz',\n",
+       "   'at',\n",
+       "   'myrtle',\n",
+       "   'broadway',\n",
+       "   'the',\n",
+       "   'neighborhood',\n",
+       "   'be',\n",
+       "   'diverse',\n",
+       "   'and',\n",
+       "   'appeal',\n",
+       "   'to',\n",
+       "   'a',\n",
+       "   'variety',\n",
+       "   'of',\n",
+       "   'people'],\n",
+       "  'review_rating': 100},)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "# create train/test splits\n",
-    "from dask_ml.model_selection import train_test_split\n",
-    "X_train, X_test, y_train, y_test = train_test_split(\n",
-    "    X, \n",
-    "    y, \n",
-    "    test_size=0.20, \n",
-    "    random_state=40,\n",
-    ")"
+    "import dask.bag as db\n",
+    "\n",
+    "# convert dask dataframe back to Dask bag\n",
+    "new_bag = db.from_delayed(\n",
+    "    ddf.map_partitions(lambda x: x.to_dict(orient=\"records\")).to_delayed()\n",
+    ")\n",
+    "\n",
+    "new_bag.take(1)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "5e9c796c-252f-432a-8dac-489d45ee998d",
+   "id": "937cac09-2b12-4548-ab58-a46583179ccc",
    "metadata": {},
    "source": [
-    "And then pass these into the Dask-ML HashingVectorizer. "
+    "And then use `to_mongo` to write the data to MongoDB:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "92bc2fc9-cd96-415a-b088-9f9bcf479e00",
+   "id": "784ceae8-ce82-481b-a78f-c8b046863dfa",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# vectorize (NOTE: removed custom preprocessing bypasses)\n",
-    "from dask_ml.feature_extraction.text import HashingVectorizer\n",
-    "vect = HashingVectorizer()\n",
-    "X_train_vect = vect.fit_transform(X_train)"
+    "from dask_mongo import to_mongo\n",
+    "\n",
+    "to_mongo(\n",
+    "    new_bag,\n",
+    "    database=\"<your-database>\",\n",
+    "    collection=\"<your-collection>\",\n",
+    "    connection_kwargs={\"host\": host_uri},\n",
+    ")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "fbfd8c78-e5be-4880-9d03-c134378c938e",
-   "metadata": {},
-   "source": [
-    "Note that this Vectorizer comes with its own built-in preprocessing functions. We'll override these since we have already done our own customized preprocessing above."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5f6a755a-2721-4150-9723-e7a409a3e8e5",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# make sure to compute the chunk sizes before training\n",
-    "X_train_vect.compute_chunk_sizes()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d559a8b2-4888-45db-a33e-235091ea8699",
+   "id": "82f59260-c06e-419f-881b-ae5bd6e1f24c",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "X_train_vect = X_train_vect.persist()"
+    "*Note that because the AirBnB sample data is located on a low-resource Free Tier cluster, the read/write speeds may be suboptimal. Consider upgrading your cluster to improve your query performance.*"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "1a44d89a-2dc1-49ed-bfd7-6d2062b0d773",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## ML Classification with XGBoost\n",
-    "Now we're all set to feed our data into the XGBoost Classifier and run our predictions. We are predicting the Review Rating based on the text in the Description field."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "id": "2dfab100-29f6-4809-8f2f-fa43b6cf9b15",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n",
-      "  from pandas import MultiIndex, Int64Index\n"
-     ]
-    }
-   ],
-   "source": [
-    "import xgboost as xgb\n",
-    "clf = xgb.dask.DaskXGBClassifier()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "id": "db1bca5a-4bf7-4054-9263-d89bfb61bc78",
+   "id": "f634c3c6-a035-4544-aa1e-cfbda6026af9",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "KeyboardInterrupt",
-     "evalue": "",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m                         Traceback (most recent call last)",
-      "File \u001b[0;32m<timed eval>:2\u001b[0m, in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/dask.py:1817\u001b[0m, in \u001b[0;36mDaskXGBClassifier.fit\u001b[0;34m(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)\u001b[0m\n\u001b[1;32m   1815\u001b[0m _assert_dask_support()\n\u001b[1;32m   1816\u001b[0m args \u001b[38;5;241m=\u001b[39m {k: v \u001b[38;5;28;01mfor\u001b[39;00m k, v \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mlocals\u001b[39m()\u001b[38;5;241m.\u001b[39mitems() \u001b[38;5;28;01mif\u001b[39;00m k \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m (\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mself\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m__class__\u001b[39m\u001b[38;5;124m\"\u001b[39m)}\n\u001b[0;32m-> 1817\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_client_sync\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fit_async\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/dask.py:1623\u001b[0m, in \u001b[0;36mDaskScikitLearnBase._client_sync\u001b[0;34m(self, func, **kwargs)\u001b[0m\n\u001b[1;32m   1620\u001b[0m                 \u001b[38;5;28;01mreturn\u001b[39;00m ret\n\u001b[1;32m   1621\u001b[0m             \u001b[38;5;28;01mreturn\u001b[39;00m ret\n\u001b[0;32m-> 1623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mclient\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43masynchronous\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masynchronous\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/utils.py:309\u001b[0m, in \u001b[0;36mSyncMethodMixin.sync\u001b[0;34m(self, func, asynchronous, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m    307\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m future\n\u001b[1;32m    308\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 309\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43msync\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    310\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mloop\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfunc\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcallback_timeout\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcallback_timeout\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\n\u001b[1;32m    311\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/utils.py:372\u001b[0m, in \u001b[0;36msync\u001b[0;34m(loop, func, callback_timeout, *args, **kwargs)\u001b[0m\n\u001b[1;32m    370\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    371\u001b[0m     \u001b[38;5;28;01mwhile\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m e\u001b[38;5;241m.\u001b[39mis_set():\n\u001b[0;32m--> 372\u001b[0m         \u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m10\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m    374\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m error:\n\u001b[1;32m    375\u001b[0m     typ, exc, tb \u001b[38;5;241m=\u001b[39m error\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/utils.py:361\u001b[0m, in \u001b[0;36msync.<locals>.wait\u001b[0;34m(timeout)\u001b[0m\n\u001b[1;32m    359\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwait\u001b[39m(timeout):\n\u001b[1;32m    360\u001b[0m     \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m--> 361\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43me\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    362\u001b[0m     \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mKeyboardInterrupt\u001b[39;00m:\n\u001b[1;32m    363\u001b[0m         loop\u001b[38;5;241m.\u001b[39madd_callback(cancel)\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/threading.py:574\u001b[0m, in \u001b[0;36mEvent.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    572\u001b[0m signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_flag\n\u001b[1;32m    573\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m signaled:\n\u001b[0;32m--> 574\u001b[0m     signaled \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_cond\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mwait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    575\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m signaled\n",
-      "File \u001b[0;32m~/mambaforge/envs/dask-mongo/lib/python3.9/threading.py:316\u001b[0m, in \u001b[0;36mCondition.wait\u001b[0;34m(self, timeout)\u001b[0m\n\u001b[1;32m    314\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    315\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m timeout \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m--> 316\u001b[0m         gotit \u001b[38;5;241m=\u001b[39m \u001b[43mwaiter\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43macquire\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtimeout\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    317\u001b[0m     \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m    318\u001b[0m         gotit \u001b[38;5;241m=\u001b[39m waiter\u001b[38;5;241m.\u001b[39macquire(\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
-      "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client\n",
-      "distributed.deploy.cluster - WARNING - Failed to sync cluster info multiple times - perhaps there's a connection issue? Error:\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/comm/tcp.py\", line 426, in connect\n",
-      "    stream = await self.client.connect(\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/tornado/tcpclient.py\", line 275, in connect\n",
-      "    af, addr, stream = await connector.start(connect_timeout=timeout)\n",
-      "asyncio.exceptions.CancelledError\n",
-      "\n",
-      "During handling of the above exception, another exception occurred:\n",
-      "\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/asyncio/tasks.py\", line 490, in wait_for\n",
-      "    return fut.result()\n",
-      "asyncio.exceptions.CancelledError\n",
-      "\n",
-      "The above exception was the direct cause of the following exception:\n",
-      "\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/comm/core.py\", line 289, in connect\n",
-      "    comm = await asyncio.wait_for(\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/asyncio/tasks.py\", line 492, in wait_for\n",
-      "    raise exceptions.TimeoutError() from exc\n",
-      "asyncio.exceptions.TimeoutError\n",
-      "\n",
-      "The above exception was the direct cause of the following exception:\n",
-      "\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/deploy/cluster.py\", line 131, in _sync_cluster_info\n",
-      "    await self.scheduler_comm.set_metadata(\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/core.py\", line 827, in send_recv_from_rpc\n",
-      "    comm = await self.live_comm()\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/core.py\", line 784, in live_comm\n",
-      "    comm = await connect(\n",
-      "  File \"/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/distributed/comm/core.py\", line 315, in connect\n",
-      "    raise OSError(\n",
-      "OSError: Timed out trying to connect to tls://3.80.159.200:8786 after 30 s\n"
-     ]
-    }
-   ],
    "source": [
-    "%%time\n",
-    "# train classifier\n",
-    "clf.fit(X_train_vect, y_train)"
+    "## Summary\n",
+    "- You can use the dask-mongo connector to read and write files in parallel between your local Python session and a MongoDB Atlas cluster.\n",
+    "- You can use Dask and MongoDB to build an NLP pipeline at scale.\n",
+    "- You can run Dask computations in the cloud using a Coiled cluster. Running Coiled and MongoDB together can lead to performance gains on your queries and analyses.\n"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "313679f9-1172-4325-9ce8-fcf1daeff104",
+   "id": "0d0b0e9d-9bbf-4724-a629-9db8d7aed9ce",
    "metadata": {},
    "source": [
-    "**NOTE:** this is taking WAY too long to start running."
+    "## Get Started with dask-mongo\n",
+    "To get started with the dask-mongo connector, install it on your local machine using pip or conda.\n",
+    "\n",
+    "`pip install dask-mongo`\n",
+    "\n",
+    "`conda install dask-mongo -c conda-forge`\n",
+    "\n",
+    "You can then run the code in the accompanying notebook yourself to test-drive dask-mongo. For more information, we recommend taking a look at [the Coiled documentation](https://docs.coiled.io/user_guide/examples/mongodb.html). \n",
+    "\n",
+    "Let us know how you get on by tweeting to the developing team at [@CoiledHQ](https://twitter.com/CoiledHQ)!"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 49,
-   "id": "18cb7b93-6a76-434b-a071-4e7ada9dc90c",
+   "execution_count": null,
+   "id": "70ddaa3a-0012-445d-ba7f-512df3c58369",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'xgb' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [49]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# predict probabilities\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m proba \u001b[38;5;241m=\u001b[39m \u001b[43mxgb\u001b[49m\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'xgb' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "# predict probabilities\n",
-    "proba = xgb.predict_proba(X_test)"
-   ]
+   "outputs": [],
+   "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "a591fcf0-776b-4aaf-b609-60861b8b2b6a",
+   "id": "bffebbdf-1c9e-4f64-a14f-cac827e1a7c3",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1504,25 +1358,15 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "17b57714-f31c-4090-8517-16bf84e6a5bf",
+   "id": "0f509099-716e-406a-ab0d-e158ad762977",
    "metadata": {},
    "outputs": [],
    "source": []
   },
-  {
-   "cell_type": "markdown",
-   "id": "23b900b3-6b48-4fb6-b816-077360aeda8e",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Write Back to MongoDB"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f1501521-29d0-40d5-81c0-df38ceaedcb5",
+   "id": "ea3adf7e-69ac-47d2-b96f-280132f37064",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -1530,74 +1374,621 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "375da03a-e1c3-4a0b-9922-97b086a2b198",
+   "id": "d305faeb-35f2-42b6-a6f8-4805a1c066da",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "markdown",
-   "id": "94d9e401-cc0c-4e15-8e92-566dfeb2bd66",
+   "id": "eed82700-2a37-4cd9-8cff-14a6f83372be",
    "metadata": {
     "tags": []
    },
    "source": [
-    "## How Does it Work\n",
-    "The dask-mongo connector connects to the pymongo driver and..."
+    "# OLD STUFF"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "c5e3db2d-1408-4f0d-99c4-17837e252dbb",
+   "id": "e1e5ca48-73f1-447c-b57e-4b24d09cc4cd",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "markdown",
-   "id": "dc10efe7-b72f-4a6a-b824-f6f2e69c0728",
+   "id": "99975628-6064-4115-a6c1-87716044fc03",
    "metadata": {
     "tags": []
    },
    "source": [
-    "## Conclusion"
+    "## Vectorization with Dask-ML\n",
+    "We can now transform our lemmatized tokens into numerical representations (vectors) that can be input to a machine learning algorithm.\n",
+    "\n",
+    "To do this, we’ll first create our predictor and target features and our train and test splits:"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "789a5231-20c7-4527-9cad-9339c5f4c285",
+   "execution_count": 26,
+   "id": "d126152c-d608-4ccf-996a-e2073ee49bbb",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# insert b_flattened here to bypass custom tokenization\n",
+    "ddf = b_flattened.to_dataframe()\n",
+    "# X = ddf['description']\n",
+    "# y = ddf['review_rating']\n",
+    "X = ddf['description'].to_dask_array(lengths=True)\n",
+    "y = ddf['review_rating'].to_dask_array(lengths=True)"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "345ed7dc-fd60-4a5a-8867-65e7dddb377e",
+   "execution_count": 27,
+   "id": "4428342f-81db-4d4c-aea4-9d44f9efff22",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "from dask_ml.model_selection import train_test_split"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "aac33f8f-f709-49cf-bc98-7e4a0d3c7dbd",
+   "execution_count": 28,
+   "id": "222c57c9-c493-41b8-ac18-18e25b50c63f",
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    "# create train/test splits\n",
+    "from dask_ml.model_selection import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(\n",
+    "    X, \n",
+    "    y, \n",
+    "    test_size=0.20, \n",
+    "    random_state=40,\n",
+    ")"
+   ]
   },
   {
    "cell_type": "markdown",
-   "id": "eed82700-2a37-4cd9-8cff-14a6f83372be",
-   "metadata": {
-    "tags": []
-   },
+   "id": "5e9c796c-252f-432a-8dac-489d45ee998d",
+   "metadata": {},
    "source": [
-    "# OLD STUFF"
+    "And then pass these into the Dask-ML HashingVectorizer. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "92bc2fc9-cd96-415a-b088-9f9bcf479e00",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# vectorize \n",
+    "from dask_ml.feature_extraction.text import HashingVectorizer\n",
+    "vect = HashingVectorizer()\n",
+    "X_train_vect = vect.fit_transform(X_train)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fbfd8c78-e5be-4880-9d03-c134378c938e",
+   "metadata": {},
+   "source": [
+    "Note that this Vectorizer comes with its own built-in preprocessing functions. We could override these since we have already done our own customized preprocessing above."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "5f6a755a-2721-4150-9723-e7a409a3e8e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <td>\n",
+       "            <table>\n",
+       "                <thead>\n",
+       "                    <tr>\n",
+       "                        <td> </td>\n",
+       "                        <th> Array </th>\n",
+       "                        <th> Chunk </th>\n",
+       "                    </tr>\n",
+       "                </thead>\n",
+       "                <tbody>\n",
+       "                    \n",
+       "                    <tr>\n",
+       "                        <th> Shape </th>\n",
+       "                        <td> (2139, 1048576) </td>\n",
+       "                        <td> (211, 1048576) </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <th> Count </th>\n",
+       "                        <td> 108 Tasks </td>\n",
+       "                        <td> 12 Chunks </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                    <th> Type </th>\n",
+       "                    <td> float64 </td>\n",
+       "                    <td> scipy.sparse.csr.csr_matrix </td>\n",
+       "                    </tr>\n",
+       "                </tbody>\n",
+       "            </table>\n",
+       "        </td>\n",
+       "        <td>\n",
+       "        <svg width=\"170\" height=\"75\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
+       "\n",
+       "  <!-- Horizontal lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"0\" y1=\"2\" x2=\"120\" y2=\"2\" />\n",
+       "  <line x1=\"0\" y1=\"4\" x2=\"120\" y2=\"4\" />\n",
+       "  <line x1=\"0\" y1=\"6\" x2=\"120\" y2=\"6\" />\n",
+       "  <line x1=\"0\" y1=\"9\" x2=\"120\" y2=\"9\" />\n",
+       "  <line x1=\"0\" y1=\"11\" x2=\"120\" y2=\"11\" />\n",
+       "  <line x1=\"0\" y1=\"13\" x2=\"120\" y2=\"13\" />\n",
+       "  <line x1=\"0\" y1=\"15\" x2=\"120\" y2=\"15\" />\n",
+       "  <line x1=\"0\" y1=\"17\" x2=\"120\" y2=\"17\" />\n",
+       "  <line x1=\"0\" y1=\"19\" x2=\"120\" y2=\"19\" />\n",
+       "  <line x1=\"0\" y1=\"20\" x2=\"120\" y2=\"20\" />\n",
+       "  <line x1=\"0\" y1=\"23\" x2=\"120\" y2=\"23\" />\n",
+       "  <line x1=\"0\" y1=\"25\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Vertical lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Colored Rectangle -->\n",
+       "  <polygon points=\"0.0,0.0 120.0,0.0 120.0,25.412616514582485 0.0,25.412616514582485\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
+       "\n",
+       "  <!-- Text -->\n",
+       "  <text x=\"60.000000\" y=\"45.412617\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >1048576</text>\n",
+       "  <text x=\"140.000000\" y=\"12.706308\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,140.000000,12.706308)\">2139</text>\n",
+       "</svg>\n",
+       "        </td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "dask.array<_transformer, shape=(2139, 1048576), dtype=float64, chunksize=(211, 1048576), chunktype=scipy.csr_matrix>"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# make sure to compute the chunk sizes before training\n",
+    "X_train_vect.compute_chunk_sizes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "3c67234c-75c3-4491-ab40-e13a093367a8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <td>\n",
+       "            <table>\n",
+       "                <thead>\n",
+       "                    <tr>\n",
+       "                        <td> </td>\n",
+       "                        <th> Array </th>\n",
+       "                        <th> Chunk </th>\n",
+       "                    </tr>\n",
+       "                </thead>\n",
+       "                <tbody>\n",
+       "                    \n",
+       "                    <tr>\n",
+       "                        <th> Bytes </th>\n",
+       "                        <td> 16.71 kiB </td>\n",
+       "                        <td> 1.65 kiB </td>\n",
+       "                    </tr>\n",
+       "                    \n",
+       "                    <tr>\n",
+       "                        <th> Shape </th>\n",
+       "                        <td> (2139,) </td>\n",
+       "                        <td> (211,) </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <th> Count </th>\n",
+       "                        <td> 96 Tasks </td>\n",
+       "                        <td> 12 Chunks </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                    <th> Type </th>\n",
+       "                    <td> int64 </td>\n",
+       "                    <td> numpy.ndarray </td>\n",
+       "                    </tr>\n",
+       "                </tbody>\n",
+       "            </table>\n",
+       "        </td>\n",
+       "        <td>\n",
+       "        <svg width=\"170\" height=\"75\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
+       "\n",
+       "  <!-- Horizontal lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"0\" y1=\"25\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Vertical lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"9\" y1=\"0\" x2=\"9\" y2=\"25\" />\n",
+       "  <line x1=\"20\" y1=\"0\" x2=\"20\" y2=\"25\" />\n",
+       "  <line x1=\"31\" y1=\"0\" x2=\"31\" y2=\"25\" />\n",
+       "  <line x1=\"43\" y1=\"0\" x2=\"43\" y2=\"25\" />\n",
+       "  <line x1=\"53\" y1=\"0\" x2=\"53\" y2=\"25\" />\n",
+       "  <line x1=\"64\" y1=\"0\" x2=\"64\" y2=\"25\" />\n",
+       "  <line x1=\"73\" y1=\"0\" x2=\"73\" y2=\"25\" />\n",
+       "  <line x1=\"82\" y1=\"0\" x2=\"82\" y2=\"25\" />\n",
+       "  <line x1=\"90\" y1=\"0\" x2=\"90\" y2=\"25\" />\n",
+       "  <line x1=\"98\" y1=\"0\" x2=\"98\" y2=\"25\" />\n",
+       "  <line x1=\"109\" y1=\"0\" x2=\"109\" y2=\"25\" />\n",
+       "  <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"25\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Colored Rectangle -->\n",
+       "  <polygon points=\"0.0,0.0 120.0,0.0 120.0,25.412616514582485 0.0,25.412616514582485\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
+       "\n",
+       "  <!-- Text -->\n",
+       "  <text x=\"60.000000\" y=\"45.412617\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >2139</text>\n",
+       "  <text x=\"140.000000\" y=\"12.706308\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(0,140.000000,12.706308)\">1</text>\n",
+       "</svg>\n",
+       "        </td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "dask.array<concatenate, shape=(2139,), dtype=int64, chunksize=(211,), chunktype=numpy.ndarray>"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# omit?\n",
+    "y_train.compute_chunk_sizes()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 32,
+   "id": "c4c76ce9-a2c0-435f-9833-08c58d061e5b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# omit?\n",
+    "y_train = y_train.reshape(2139,1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "2e022881-65bd-4936-868e-03e4be6d8d6f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<table>\n",
+       "    <tr>\n",
+       "        <td>\n",
+       "            <table>\n",
+       "                <thead>\n",
+       "                    <tr>\n",
+       "                        <td> </td>\n",
+       "                        <th> Array </th>\n",
+       "                        <th> Chunk </th>\n",
+       "                    </tr>\n",
+       "                </thead>\n",
+       "                <tbody>\n",
+       "                    \n",
+       "                    <tr>\n",
+       "                        <th> Bytes </th>\n",
+       "                        <td> 16.71 kiB </td>\n",
+       "                        <td> 1.65 kiB </td>\n",
+       "                    </tr>\n",
+       "                    \n",
+       "                    <tr>\n",
+       "                        <th> Shape </th>\n",
+       "                        <td> (2139, 1) </td>\n",
+       "                        <td> (211, 1) </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                        <th> Count </th>\n",
+       "                        <td> 108 Tasks </td>\n",
+       "                        <td> 12 Chunks </td>\n",
+       "                    </tr>\n",
+       "                    <tr>\n",
+       "                    <th> Type </th>\n",
+       "                    <td> int64 </td>\n",
+       "                    <td> numpy.ndarray </td>\n",
+       "                    </tr>\n",
+       "                </tbody>\n",
+       "            </table>\n",
+       "        </td>\n",
+       "        <td>\n",
+       "        <svg width=\"75\" height=\"170\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n",
+       "\n",
+       "  <!-- Horizontal lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"25\" y2=\"0\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"0\" y1=\"9\" x2=\"25\" y2=\"9\" />\n",
+       "  <line x1=\"0\" y1=\"20\" x2=\"25\" y2=\"20\" />\n",
+       "  <line x1=\"0\" y1=\"31\" x2=\"25\" y2=\"31\" />\n",
+       "  <line x1=\"0\" y1=\"43\" x2=\"25\" y2=\"43\" />\n",
+       "  <line x1=\"0\" y1=\"53\" x2=\"25\" y2=\"53\" />\n",
+       "  <line x1=\"0\" y1=\"64\" x2=\"25\" y2=\"64\" />\n",
+       "  <line x1=\"0\" y1=\"73\" x2=\"25\" y2=\"73\" />\n",
+       "  <line x1=\"0\" y1=\"82\" x2=\"25\" y2=\"82\" />\n",
+       "  <line x1=\"0\" y1=\"90\" x2=\"25\" y2=\"90\" />\n",
+       "  <line x1=\"0\" y1=\"98\" x2=\"25\" y2=\"98\" />\n",
+       "  <line x1=\"0\" y1=\"109\" x2=\"25\" y2=\"109\" />\n",
+       "  <line x1=\"0\" y1=\"120\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Vertical lines -->\n",
+       "  <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"120\" style=\"stroke-width:2\" />\n",
+       "  <line x1=\"25\" y1=\"0\" x2=\"25\" y2=\"120\" style=\"stroke-width:2\" />\n",
+       "\n",
+       "  <!-- Colored Rectangle -->\n",
+       "  <polygon points=\"0.0,0.0 25.412616514582485,0.0 25.412616514582485,120.0 0.0,120.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n",
+       "\n",
+       "  <!-- Text -->\n",
+       "  <text x=\"12.706308\" y=\"140.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >1</text>\n",
+       "  <text x=\"45.412617\" y=\"60.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,45.412617,60.000000)\">2139</text>\n",
+       "</svg>\n",
+       "        </td>\n",
+       "    </tr>\n",
+       "</table>"
+      ],
+      "text/plain": [
+       "dask.array<reshape, shape=(2139, 1), dtype=int64, chunksize=(211, 1), chunktype=numpy.ndarray>"
+      ]
+     },
+     "execution_count": 33,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# omit?\n",
+    "y_train"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "d559a8b2-4888-45db-a33e-235091ea8699",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X_train_vect = X_train_vect.persist()\n",
+    "y_train = y_train.persist()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97e08030-9d46-439c-830c-8aa34f43d6e1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f5aab69-4788-41a8-b588-5c288b3dc757",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fadeb7c-efca-4978-ac13-6606049e90b5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fdce62a6-4bd7-4719-aa4d-2debc1adcde3",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1a44d89a-2dc1-49ed-bfd7-6d2062b0d773",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## Machine Learning Prediction with XGBoost\n",
+    "Now we're all set to feed our data into our XGBoost model and run our predictions. We are predicting the Review Rating based on the text in the Description field."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "1d9c5b43-1996-4f1e-9e63-d6238c5a69f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## NOTE\n",
+    "## Let's try to turn the vectors into dataframes and see if that solves the `divisions not found` \n",
+    "## error we're getting with XGBoost.\n",
+    "\n",
+    "## casting `X_train_vect` `to_dask_dataframe()` doesn't work because of mismatched shapes\n",
+    "## creating it as a dataframe with `dd.from_dask_array()` doesn't work for the same reason\n",
+    "## ValueError: Shape of passed values is (176, 1), indices imply (176, 1048576)\n",
+    "\n",
+    "## The XGBoost error is due to the fact that it does not support the dask sparse matrix output of \n",
+    "## HashingVectorizer\n",
+    "## See issue here: https://github.com/dmlc/xgboost/issues/7454"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f72fd391-8037-42ca-944b-b3123ceb0bf5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "655bf2f2-0ecf-4d81-94b9-611b7c5196d4",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "08fc7696-a7eb-40b8-9406-26f5c970d7ce",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n",
+      "  from pandas import MultiIndex, Int64Index\n"
+     ]
+    }
+   ],
+   "source": [
+    "import xgboost as xgb\n",
+    "reg = xgb.dask.DaskXGBRegressor()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "81e5d303-8aa3-4256-9252-617f17260d0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "reg.fit(X_train_vect, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f58690e-f4d3-4fab-8ee3-18d88660e6b9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2a5a8a70-493b-4929-9906-4445e564929b",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c88906d-f9b0-42cc-9998-e55da7c1a0f1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "id": "2dfab100-29f6-4809-8f2f-fa43b6cf9b15",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n",
+      "  from pandas import MultiIndex, Int64Index\n"
+     ]
+    }
+   ],
+   "source": [
+    "import xgboost as xgb\n",
+    "clf = xgb.dask.DaskXGBClassifier()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "db1bca5a-4bf7-4054-9263-d89bfb61bc78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%time\n",
+    "# train classifier\n",
+    "clf.fit(X_train_vect, y_train)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "id": "18cb7b93-6a76-434b-a071-4e7ada9dc90c",
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'xgb' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Input \u001b[0;32mIn [49]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# predict probabilities\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m proba \u001b[38;5;241m=\u001b[39m \u001b[43mxgb\u001b[49m\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'xgb' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "# predict probabilities\n",
+    "proba = xgb.predict_proba(X_test)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a591fcf0-776b-4aaf-b609-60861b8b2b6a",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17b57714-f31c-4090-8517-16bf84e6a5bf",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aac33f8f-f709-49cf-bc98-7e4a0d3c7dbd",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
   {
    "cell_type": "markdown",
    "id": "62970030-962c-4a95-9898-85801dd3d78a",

From b6ffc2312b3fe127dd277563aeb9c0d8af54cd07 Mon Sep 17 00:00:00 2001
From: Richard Pelgrim <68642378+rrpelgrim@users.noreply.github.com>
Date: Tue, 14 Jun 2022 15:37:33 +0200
Subject: [PATCH 3/5] Update environment.yml

---
 mongodb-with-coiled/environment.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mongodb-with-coiled/environment.yml b/mongodb-with-coiled/environment.yml
index 36bd739..d156717 100644
--- a/mongodb-with-coiled/environment.yml
+++ b/mongodb-with-coiled/environment.yml
@@ -23,6 +23,7 @@ dependencies:
   - gensim[distributed]
   - dask-ml
   - xgboost==1.5.1
+  - cloudpickle==2.1.0
   - pip
   - pip:
     - pymongo[srv]

From 486e7f4ee4ffd129769e8f9b1e07fb4fbdc80f4d Mon Sep 17 00:00:00 2001
From: Richard Pelgrim <68642378+rrpelgrim@users.noreply.github.com>
Date: Tue, 14 Jun 2022 15:40:42 +0200
Subject: [PATCH 4/5] Run code again - works

---
 mongodb-with-coiled/dask-mongo.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mongodb-with-coiled/dask-mongo.ipynb b/mongodb-with-coiled/dask-mongo.ipynb
index f358988..9f86872 100644
--- a/mongodb-with-coiled/dask-mongo.ipynb
+++ b/mongodb-with-coiled/dask-mongo.ipynb
@@ -76,7 +76,7 @@
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "55ae007800324f6996b66a699a648d08",
+       "model_id": "3f4a93d1d7864512998f389eb2818cf2",
        "version_major": 2,
        "version_minor": 0
       },

From 9e76e201b7a3a31400552122b23cb47ab0d9372e Mon Sep 17 00:00:00 2001
From: Richard Pelgrim <68642378+rrpelgrim@users.noreply.github.com>
Date: Tue, 14 Jun 2022 15:42:05 +0200
Subject: [PATCH 5/5] Delete old ML code that doesn't work

because of xgboost error (divisions not known)
---
 mongodb-with-coiled/dask-mongo.ipynb | 421 ---------------------------
 1 file changed, 421 deletions(-)

diff --git a/mongodb-with-coiled/dask-mongo.ipynb b/mongodb-with-coiled/dask-mongo.ipynb
index 9f86872..f6c58ad 100644
--- a/mongodb-with-coiled/dask-mongo.ipynb
+++ b/mongodb-with-coiled/dask-mongo.ipynb
@@ -1363,68 +1363,6 @@
    "outputs": [],
    "source": []
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ea3adf7e-69ac-47d2-b96f-280132f37064",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d305faeb-35f2-42b6-a6f8-4805a1c066da",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "eed82700-2a37-4cd9-8cff-14a6f83372be",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "# OLD STUFF"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e1e5ca48-73f1-447c-b57e-4b24d09cc4cd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "99975628-6064-4115-a6c1-87716044fc03",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Vectorization with Dask-ML\n",
-    "We can now transform our lemmatized tokens into numerical representations (vectors) that can be input to a machine learning algorithm.\n",
-    "\n",
-    "To do this, we’ll first create our predictor and target features and our train and test splits:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "d126152c-d608-4ccf-996a-e2073ee49bbb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# insert b_flattened here to bypass custom tokenization\n",
-    "ddf = b_flattened.to_dataframe()\n",
-    "# X = ddf['description']\n",
-    "# y = ddf['review_rating']\n",
-    "X = ddf['description'].to_dask_array(lengths=True)\n",
-    "y = ddf['review_rating'].to_dask_array(lengths=True)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 27,
@@ -1808,344 +1746,6 @@
    "outputs": [],
    "source": []
   },
-  {
-   "cell_type": "markdown",
-   "id": "1a44d89a-2dc1-49ed-bfd7-6d2062b0d773",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## Machine Learning Prediction with XGBoost\n",
-    "Now we're all set to feed our data into our XGBoost model and run our predictions. We are predicting the Review Rating based on the text in the Description field."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 45,
-   "id": "1d9c5b43-1996-4f1e-9e63-d6238c5a69f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "## NOTE\n",
-    "## Let's try to turn the vectors into dataframes and see if that solves the `divisions not found` \n",
-    "## error we're getting with XGBoost.\n",
-    "\n",
-    "## casting `X_train_vect` `to_dask_dataframe()` doesn't work because of mismatched shapes\n",
-    "## creating it as a dataframe with `dd.from_dask_array()` doesn't work for the same reason\n",
-    "## ValueError: Shape of passed values is (176, 1), indices imply (176, 1048576)\n",
-    "\n",
-    "## The XGBoost error is due to the fact that it does not support the dask sparse matrix output of \n",
-    "## HashingVectorizer\n",
-    "## See issue here: https://github.com/dmlc/xgboost/issues/7454"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f72fd391-8037-42ca-944b-b3123ceb0bf5",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "655bf2f2-0ecf-4d81-94b9-611b7c5196d4",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "08fc7696-a7eb-40b8-9406-26f5c970d7ce",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n",
-      "  from pandas import MultiIndex, Int64Index\n"
-     ]
-    }
-   ],
-   "source": [
-    "import xgboost as xgb\n",
-    "reg = xgb.dask.DaskXGBRegressor()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "81e5d303-8aa3-4256-9252-617f17260d0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%time\n",
-    "reg.fit(X_train_vect, y_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2f58690e-f4d3-4fab-8ee3-18d88660e6b9",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2a5a8a70-493b-4929-9906-4445e564929b",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2c88906d-f9b0-42cc-9998-e55da7c1a0f1",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "id": "2dfab100-29f6-4809-8f2f-fa43b6cf9b15",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/rpelgrim/mambaforge/envs/dask-mongo/lib/python3.9/site-packages/xgboost/compat.py:36: FutureWarning: pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.\n",
-      "  from pandas import MultiIndex, Int64Index\n"
-     ]
-    }
-   ],
-   "source": [
-    "import xgboost as xgb\n",
-    "clf = xgb.dask.DaskXGBClassifier()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "db1bca5a-4bf7-4054-9263-d89bfb61bc78",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%%time\n",
-    "# train classifier\n",
-    "clf.fit(X_train_vect, y_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 49,
-   "id": "18cb7b93-6a76-434b-a071-4e7ada9dc90c",
-   "metadata": {},
-   "outputs": [
-    {
-     "ename": "NameError",
-     "evalue": "name 'xgb' is not defined",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
-      "Input \u001b[0;32mIn [49]\u001b[0m, in \u001b[0;36m<cell line: 2>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;66;03m# predict probabilities\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m proba \u001b[38;5;241m=\u001b[39m \u001b[43mxgb\u001b[49m\u001b[38;5;241m.\u001b[39mpredict_proba(X_test)\n",
-      "\u001b[0;31mNameError\u001b[0m: name 'xgb' is not defined"
-     ]
-    }
-   ],
-   "source": [
-    "# predict probabilities\n",
-    "proba = xgb.predict_proba(X_test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a591fcf0-776b-4aaf-b609-60861b8b2b6a",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "17b57714-f31c-4090-8517-16bf84e6a5bf",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "aac33f8f-f709-49cf-bc98-7e4a0d3c7dbd",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "markdown",
-   "id": "62970030-962c-4a95-9898-85801dd3d78a",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
-    "tags": []
-   },
-   "source": [
-    "### Create train/test split"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "803c7850-964c-4406-80e4-e62e8524ebbd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dask_ml.model_selection import train_test_split"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ebd3927f-045c-4085-b0fa-4489c547ac4e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X = ddf['description'].to_dask_array(lengths=True)\n",
-    "y = ddf['review_rating'].to_dask_array(lengths=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ea3b799-f210-40d8-9092-893f3701f420",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_train, X_test, y_train, y_test = train_test_split(\n",
-    "    X, \n",
-    "    y, \n",
-    "    test_size=0.20, \n",
-    "    random_state=40\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1398654f-7fa8-4e76-a1ae-1fb1ef1cb4de",
-   "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
-    "tags": []
-   },
-   "source": [
-    "### Vectorize"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dae1e939-af01-453b-b329-874b23ebe18b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from dask_ml.feature_extraction.text import HashingVectorizer"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1cbb872e-7b3b-46de-a588-dd054b56c21d",
-   "metadata": {},
-   "source": [
-    "HashingVectorizer has some built-in tokenization and preprocessing capabilities we could explore.\n",
-    "\n",
-    "We'll just use it out-of-the-box for now."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "81fa8ee5-8a8e-439a-b3b0-8f75c2353f82",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "vect = HashingVectorizer()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "29d35d96-9d6c-4031-a264-dc34596ce37d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "X_train_vect = vect.fit_transform(X_train)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "id": "bb19e54e-14f2-4c54-ac40-8b6242a7d1be",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<table>\n",
-       "    <tr>\n",
-       "        <td>\n",
-       "            <table>\n",
-       "                <thead>\n",
-       "                    <tr>\n",
-       "                        <td> </td>\n",
-       "                        <th> Array </th>\n",
-       "                        <th> Chunk </th>\n",
-       "                    </tr>\n",
-       "                </thead>\n",
-       "                <tbody>\n",
-       "                    \n",
-       "                    <tr>\n",
-       "                        <th> Shape </th>\n",
-       "                        <td> (nan, 1048576) </td>\n",
-       "                        <td> (nan, 1048576) </td>\n",
-       "                    </tr>\n",
-       "                    <tr>\n",
-       "                        <th> Count </th>\n",
-       "                        <td> 108 Tasks </td>\n",
-       "                        <td> 12 Chunks </td>\n",
-       "                    </tr>\n",
-       "                    <tr>\n",
-       "                    <th> Type </th>\n",
-       "                    <td> float64 </td>\n",
-       "                    <td> scipy.sparse._csr.csr_matrix </td>\n",
-       "                    </tr>\n",
-       "                </tbody>\n",
-       "            </table>\n",
-       "        </td>\n",
-       "        <td>\n",
-       "        \n",
-       "        </td>\n",
-       "    </tr>\n",
-       "</table>"
-      ],
-      "text/plain": [
-       "dask.array<_transformer, shape=(nan, 1048576), dtype=float64, chunksize=(nan, 1048576), chunktype=scipy.csr_matrix>"
-      ]
-     },
-     "execution_count": 35,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "X_train_vect"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "5d2f8cd8-2209-4561-a69e-917e56067279",
@@ -2293,16 +1893,6 @@
     "Now use scipy.sparse matrix as input for distributed XGBoostClassifier."
    ]
   },
-  {
-   "cell_type": "markdown",
-   "id": "829cea36-2e64-49e8-8b6d-428786c3cc00",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "## 5. Train XGBoost Model"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 39,
@@ -2605,17 +2195,6 @@
    "outputs": [],
    "source": []
   },
-  {
-   "cell_type": "markdown",
-   "id": "cc4b0fac-4dcf-4a94-89b1-01b37b6fd339",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "# TRYING OUT\n",
-    "Let's try to map the preprocessing functions over `b_flattened` without splitting the `description` from the `ratings`:"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 84,