From 5d84ca3e6928b2a87bd8134a5b22a151f0c22e6c Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Fri, 2 Aug 2024 07:45:58 +1000 Subject: [PATCH 01/11] removed incorrect transformation for xyz From 013734356130fa4ef717b395b65f74c43e679339 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Thu, 15 Aug 2024 00:32:32 +1000 Subject: [PATCH 02/11] Update prerequisites notebook for Python tutorial --- tutorials/Python_PreReq_Notebook.ipynb | 421 ++++++++++++++++++++++++- 1 file changed, 416 insertions(+), 5 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index f403755..4061975 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -349,10 +349,393 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "a3865f8a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "A module that was compiled using NumPy 1.x cannot be run in\n", + "NumPy 2.0.1 as it may crash. To support both 1.x and 2.x\n", + "versions of NumPy, modules must be compiled with NumPy 2.0.\n", + "Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n", + "\n", + "If you are a user of the module, the easiest solution will be to\n", + "downgrade to 'numpy<2' or try to upgrade the affected module.\n", + "We expect that some modules will need time to support NumPy 2.\n", + "\n", + "Traceback (most recent call last): File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 197, in _run_module_as_main\n", + " return _run_code(code, main_globals, None,\n", + " File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 87, in _run_code\n", + " exec(code, run_globals)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py\", line 16, in \n", + " app.launch_new_instance()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\traitlets\\config\\application.py\", line 846, in launch_instance\n", + " app.start()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 677, in start\n", + " self.io_loop.start()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 199, in start\n", + " self.asyncio_loop.run_forever()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 596, in run_forever\n", + " self._run_once()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 1890, in _run_once\n", + " handle._run()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\events.py\", line 80, in _run\n", + " self._context.run(self._callback, *self._args)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 457, in dispatch_queue\n", + " await self.process_one()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 446, in process_one\n", + " await dispatch(*args)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 353, in dispatch_shell\n", + " await result\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 648, in execute_request\n", + " reply_content = await reply_content\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 353, in do_execute\n", + " res = shell.run_cell(code, store_history=store_history, silent=silent)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 533, in run_cell\n", + " return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2901, in run_cell\n", + " result = self._run_cell(\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2947, in _run_cell\n", + " return runner(coro)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 68, in _pseudo_sync_runner\n", + " coro.send(None)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3172, in run_cell_async\n", + " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3364, in run_ast_nodes\n", + " if (await self.run_code(code, result, async_=asy)):\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3444, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/18732846.py\", line 1, in \n", + " import pandas as pd\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\__init__.py\", line 77, in \n", + " from pandas.core.api import (\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\api.py\", line 28, in \n", + " from pandas.core.arrays import Categorical\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\__init__.py\", line 1, in \n", + " from pandas.core.arrays.arrow import ArrowExtensionArray\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\__init__.py\", line 5, in \n", + " from pandas.core.arrays.arrow.array import ArrowExtensionArray\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\array.py\", line 50, in \n", + " from pandas.core import (\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\ops\\__init__.py\", line 8, in \n", + " from pandas.core.ops.array_ops import (\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\ops\\array_ops.py\", line 56, in \n", + " from pandas.core.computation import expressions\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\computation\\expressions.py\", line 21, in \n", + " from pandas.core.computation.check import NUMEXPR_INSTALLED\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\computation\\check.py\", line 5, in \n", + " ne = import_optional_dependency(\"numexpr\", errors=\"warn\")\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\compat\\_optional.py\", line 135, in import_optional_dependency\n", + " module = importlib.import_module(name)\n", + " File \"d:\\software\\Anaconda\\lib\\importlib\\__init__.py\", line 127, in import_module\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\numexpr\\__init__.py\", line 26, in \n", + " from numexpr.interpreter import MAX_THREADS, use_vml, __BLOCK_SIZE1__\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "_ARRAY_API not found", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;31mAttributeError\u001b[0m: _ARRAY_API not found" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "A module that was compiled using NumPy 1.x cannot be run in\n", + "NumPy 2.0.1 as it may crash. To support both 1.x and 2.x\n", + "versions of NumPy, modules must be compiled with NumPy 2.0.\n", + "Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n", + "\n", + "If you are a user of the module, the easiest solution will be to\n", + "downgrade to 'numpy<2' or try to upgrade the affected module.\n", + "We expect that some modules will need time to support NumPy 2.\n", + "\n", + "Traceback (most recent call last): File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 197, in _run_module_as_main\n", + " return _run_code(code, main_globals, None,\n", + " File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 87, in _run_code\n", + " exec(code, run_globals)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py\", line 16, in \n", + " app.launch_new_instance()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\traitlets\\config\\application.py\", line 846, in launch_instance\n", + " app.start()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 677, in start\n", + " self.io_loop.start()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 199, in start\n", + " self.asyncio_loop.run_forever()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 596, in run_forever\n", + " self._run_once()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 1890, in _run_once\n", + " handle._run()\n", + " File \"d:\\software\\Anaconda\\lib\\asyncio\\events.py\", line 80, in _run\n", + " self._context.run(self._callback, *self._args)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 457, in dispatch_queue\n", + " await self.process_one()\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 446, in process_one\n", + " await dispatch(*args)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 353, in dispatch_shell\n", + " await result\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 648, in execute_request\n", + " reply_content = await reply_content\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 353, in do_execute\n", + " res = shell.run_cell(code, store_history=store_history, silent=silent)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 533, in run_cell\n", + " return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2901, in run_cell\n", + " result = self._run_cell(\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2947, in _run_cell\n", + " return runner(coro)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 68, in _pseudo_sync_runner\n", + " coro.send(None)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3172, in run_cell_async\n", + " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3364, in run_ast_nodes\n", + " if (await self.run_code(code, result, async_=asy)):\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3444, in run_code\n", + " exec(code_obj, self.user_global_ns, self.user_ns)\n", + " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/18732846.py\", line 1, in \n", + " import pandas as pd\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\__init__.py\", line 77, in \n", + " from pandas.core.api import (\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\api.py\", line 28, in \n", + " from pandas.core.arrays import Categorical\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\__init__.py\", line 1, in \n", + " from pandas.core.arrays.arrow import ArrowExtensionArray\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\__init__.py\", line 5, in \n", + " from pandas.core.arrays.arrow.array import ArrowExtensionArray\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\array.py\", line 64, in \n", + " from pandas.core.arrays.masked import BaseMaskedArray\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\masked.py\", line 60, in \n", + " from pandas.core import (\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\nanops.py\", line 52, in \n", + " bn = import_optional_dependency(\"bottleneck\", errors=\"warn\")\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\compat\\_optional.py\", line 135, in import_optional_dependency\n", + " module = importlib.import_module(name)\n", + " File \"d:\\software\\Anaconda\\lib\\importlib\\__init__.py\", line 127, in import_module\n", + " return _bootstrap._gcd_import(name[level:], package, level)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\bottleneck\\__init__.py\", line 2, in \n", + " from .reduce import (\n" + ] + }, + { + "ename": "AttributeError", + "evalue": "_ARRAY_API not found", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[1;31mAttributeError\u001b[0m: _ARRAY_API not found" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
296461922024-01-31 23:45:592024-01-31 23:54:36NaN3.18NaNNone107263015.770.000.52.000.001.021.77NaNNaN
296462012024-01-31 23:13:072024-01-31 23:27:52NaN4.00NaNNone114236018.401.000.52.340.001.025.74NaNNaN
296462122024-01-31 23:19:002024-01-31 23:38:00NaN3.33NaNNone21125019.970.000.50.000.001.023.97NaNNaN
296462222024-01-31 23:07:232024-01-31 23:25:14NaN3.06NaNNone10713023.880.000.55.580.001.033.46NaNNaN
296462312024-01-31 23:58:252024-02-01 00:13:30NaN8.10NaNNone13875032.407.750.57.296.941.055.88NaNNaN
\n", + "
" + ], + "text/plain": [ + " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", + "2964619 2 2024-01-31 23:45:59 2024-01-31 23:54:36 NaN \n", + "2964620 1 2024-01-31 23:13:07 2024-01-31 23:27:52 NaN \n", + "2964621 2 2024-01-31 23:19:00 2024-01-31 23:38:00 NaN \n", + "2964622 2 2024-01-31 23:07:23 2024-01-31 23:25:14 NaN \n", + "2964623 1 2024-01-31 23:58:25 2024-02-01 00:13:30 NaN \n", + "\n", + " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", + "2964619 3.18 NaN None 107 \n", + "2964620 4.00 NaN None 114 \n", + "2964621 3.33 NaN None 211 \n", + "2964622 3.06 NaN None 107 \n", + "2964623 8.10 NaN None 138 \n", + "\n", + " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", + "2964619 263 0 15.77 0.00 0.5 2.00 \n", + "2964620 236 0 18.40 1.00 0.5 2.34 \n", + "2964621 25 0 19.97 0.00 0.5 0.00 \n", + "2964622 13 0 23.88 0.00 0.5 5.58 \n", + "2964623 75 0 32.40 7.75 0.5 7.29 \n", + "\n", + " tolls_amount improvement_surcharge total_amount \\\n", + "2964619 0.00 1.0 21.77 \n", + "2964620 0.00 1.0 25.74 \n", + "2964621 0.00 1.0 23.97 \n", + "2964622 0.00 1.0 33.46 \n", + "2964623 6.94 1.0 55.88 \n", + "\n", + " congestion_surcharge Airport_fee \n", + "2964619 NaN NaN \n", + "2964620 NaN NaN \n", + "2964621 NaN NaN \n", + "2964622 NaN NaN \n", + "2964623 NaN NaN " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -458,10 +841,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "1b5a59dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[91m\u001b[1mSomething went wrong. Reinstall and try again.\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Traceback (most recent call last):\n", + " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/3853483516.py\", line 12, in \n", + " SparkSession.builder.appName(\"MAST30034 Tutorial\")\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\sql\\session.py\", line 497, in getOrCreate\n", + " sc = SparkContext.getOrCreate(sparkConf)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 515, in getOrCreate\n", + " SparkContext(conf=conf or SparkConf())\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 201, in __init__\n", + " SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 436, in _ensure_initialized\n", + " SparkContext._gateway = gateway or launch_gateway(conf)\n", + " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\java_gateway.py\", line 107, in launch_gateway\n", + " raise PySparkRuntimeError(\n", + "pyspark.errors.exceptions.base.PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.\n" + ] + } + ], "source": [ "import base64\n", "import traceback\n", @@ -503,7 +914,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.7" }, "latex_envs": { "LaTeX_envs_menu_present": true, From e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Thu, 15 Aug 2024 00:42:01 +1000 Subject: [PATCH 03/11] Update Python_PreReq_Notebook.ipynb --- tutorials/Python_PreReq_Notebook.ipynb | 218 +------------------------ 1 file changed, 7 insertions(+), 211 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index 4061975..8c08b19 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "id": "52550d58", "metadata": { "ExecuteTime": { @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "id": "bbd9bd35", "metadata": { "ExecuteTime": { @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "14395028", "metadata": { "ExecuteTime": { @@ -349,194 +349,10 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "a3865f8a", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "A module that was compiled using NumPy 1.x cannot be run in\n", - "NumPy 2.0.1 as it may crash. To support both 1.x and 2.x\n", - "versions of NumPy, modules must be compiled with NumPy 2.0.\n", - "Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n", - "\n", - "If you are a user of the module, the easiest solution will be to\n", - "downgrade to 'numpy<2' or try to upgrade the affected module.\n", - "We expect that some modules will need time to support NumPy 2.\n", - "\n", - "Traceback (most recent call last): File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 197, in _run_module_as_main\n", - " return _run_code(code, main_globals, None,\n", - " File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 87, in _run_code\n", - " exec(code, run_globals)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py\", line 16, in \n", - " app.launch_new_instance()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\traitlets\\config\\application.py\", line 846, in launch_instance\n", - " app.start()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 677, in start\n", - " self.io_loop.start()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 199, in start\n", - " self.asyncio_loop.run_forever()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 596, in run_forever\n", - " self._run_once()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 1890, in _run_once\n", - " handle._run()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\events.py\", line 80, in _run\n", - " self._context.run(self._callback, *self._args)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 457, in dispatch_queue\n", - " await self.process_one()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 446, in process_one\n", - " await dispatch(*args)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 353, in dispatch_shell\n", - " await result\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 648, in execute_request\n", - " reply_content = await reply_content\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 353, in do_execute\n", - " res = shell.run_cell(code, store_history=store_history, silent=silent)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 533, in run_cell\n", - " return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2901, in run_cell\n", - " result = self._run_cell(\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2947, in _run_cell\n", - " return runner(coro)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 68, in _pseudo_sync_runner\n", - " coro.send(None)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3172, in run_cell_async\n", - " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3364, in run_ast_nodes\n", - " if (await self.run_code(code, result, async_=asy)):\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3444, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/18732846.py\", line 1, in \n", - " import pandas as pd\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\__init__.py\", line 77, in \n", - " from pandas.core.api import (\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\api.py\", line 28, in \n", - " from pandas.core.arrays import Categorical\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\__init__.py\", line 1, in \n", - " from pandas.core.arrays.arrow import ArrowExtensionArray\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\__init__.py\", line 5, in \n", - " from pandas.core.arrays.arrow.array import ArrowExtensionArray\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\array.py\", line 50, in \n", - " from pandas.core import (\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\ops\\__init__.py\", line 8, in \n", - " from pandas.core.ops.array_ops import (\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\ops\\array_ops.py\", line 56, in \n", - " from pandas.core.computation import expressions\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\computation\\expressions.py\", line 21, in \n", - " from pandas.core.computation.check import NUMEXPR_INSTALLED\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\computation\\check.py\", line 5, in \n", - " ne = import_optional_dependency(\"numexpr\", errors=\"warn\")\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\compat\\_optional.py\", line 135, in import_optional_dependency\n", - " module = importlib.import_module(name)\n", - " File \"d:\\software\\Anaconda\\lib\\importlib\\__init__.py\", line 127, in import_module\n", - " return _bootstrap._gcd_import(name[level:], package, level)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\numexpr\\__init__.py\", line 26, in \n", - " from numexpr.interpreter import MAX_THREADS, use_vml, __BLOCK_SIZE1__\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "_ARRAY_API not found", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;31mAttributeError\u001b[0m: _ARRAY_API not found" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "A module that was compiled using NumPy 1.x cannot be run in\n", - "NumPy 2.0.1 as it may crash. To support both 1.x and 2.x\n", - "versions of NumPy, modules must be compiled with NumPy 2.0.\n", - "Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.\n", - "\n", - "If you are a user of the module, the easiest solution will be to\n", - "downgrade to 'numpy<2' or try to upgrade the affected module.\n", - "We expect that some modules will need time to support NumPy 2.\n", - "\n", - "Traceback (most recent call last): File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 197, in _run_module_as_main\n", - " return _run_code(code, main_globals, None,\n", - " File \"d:\\software\\Anaconda\\lib\\runpy.py\", line 87, in _run_code\n", - " exec(code, run_globals)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel_launcher.py\", line 16, in \n", - " app.launch_new_instance()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\traitlets\\config\\application.py\", line 846, in launch_instance\n", - " app.start()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelapp.py\", line 677, in start\n", - " self.io_loop.start()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\tornado\\platform\\asyncio.py\", line 199, in start\n", - " self.asyncio_loop.run_forever()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 596, in run_forever\n", - " self._run_once()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\base_events.py\", line 1890, in _run_once\n", - " handle._run()\n", - " File \"d:\\software\\Anaconda\\lib\\asyncio\\events.py\", line 80, in _run\n", - " self._context.run(self._callback, *self._args)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 457, in dispatch_queue\n", - " await self.process_one()\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 446, in process_one\n", - " await dispatch(*args)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 353, in dispatch_shell\n", - " await result\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\kernelbase.py\", line 648, in execute_request\n", - " reply_content = await reply_content\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\ipkernel.py\", line 353, in do_execute\n", - " res = shell.run_cell(code, store_history=store_history, silent=silent)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\ipykernel\\zmqshell.py\", line 533, in run_cell\n", - " return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2901, in run_cell\n", - " result = self._run_cell(\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 2947, in _run_cell\n", - " return runner(coro)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\async_helpers.py\", line 68, in _pseudo_sync_runner\n", - " coro.send(None)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3172, in run_cell_async\n", - " has_raised = await self.run_ast_nodes(code_ast.body, cell_name,\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3364, in run_ast_nodes\n", - " if (await self.run_code(code, result, async_=asy)):\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\IPython\\core\\interactiveshell.py\", line 3444, in run_code\n", - " exec(code_obj, self.user_global_ns, self.user_ns)\n", - " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/18732846.py\", line 1, in \n", - " import pandas as pd\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\__init__.py\", line 77, in \n", - " from pandas.core.api import (\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\api.py\", line 28, in \n", - " from pandas.core.arrays import Categorical\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\__init__.py\", line 1, in \n", - " from pandas.core.arrays.arrow import ArrowExtensionArray\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\__init__.py\", line 5, in \n", - " from pandas.core.arrays.arrow.array import ArrowExtensionArray\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\arrow\\array.py\", line 64, in \n", - " from pandas.core.arrays.masked import BaseMaskedArray\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\arrays\\masked.py\", line 60, in \n", - " from pandas.core import (\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\core\\nanops.py\", line 52, in \n", - " bn = import_optional_dependency(\"bottleneck\", errors=\"warn\")\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pandas\\compat\\_optional.py\", line 135, in import_optional_dependency\n", - " module = importlib.import_module(name)\n", - " File \"d:\\software\\Anaconda\\lib\\importlib\\__init__.py\", line 127, in import_module\n", - " return _bootstrap._gcd_import(name[level:], package, level)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\bottleneck\\__init__.py\", line 2, in \n", - " from .reduce import (\n" - ] - }, - { - "ename": "AttributeError", - "evalue": "_ARRAY_API not found", - "output_type": "error", - "traceback": [ - "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[1;31mAttributeError\u001b[0m: _ARRAY_API not found" - ] - }, { "data": { "text/html": [ @@ -731,7 +547,7 @@ "2964623 NaN NaN " ] }, - "execution_count": 3, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -841,7 +657,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 7, "id": "1b5a59dc", "metadata": {}, "outputs": [ @@ -849,27 +665,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "\u001b[91m\u001b[1mSomething went wrong. Reinstall and try again.\u001b[0m\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Traceback (most recent call last):\n", - " File \"C:\\Users\\61434\\AppData\\Local\\Temp/ipykernel_14460/3853483516.py\", line 12, in \n", - " SparkSession.builder.appName(\"MAST30034 Tutorial\")\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\sql\\session.py\", line 497, in getOrCreate\n", - " sc = SparkContext.getOrCreate(sparkConf)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 515, in getOrCreate\n", - " SparkContext(conf=conf or SparkConf())\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 201, in __init__\n", - " SparkContext._ensure_initialized(self, gateway=gateway, conf=conf)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\context.py\", line 436, in _ensure_initialized\n", - " SparkContext._gateway = gateway or launch_gateway(conf)\n", - " File \"d:\\software\\Anaconda\\lib\\site-packages\\pyspark\\java_gateway.py\", line 107, in launch_gateway\n", - " raise PySparkRuntimeError(\n", - "pyspark.errors.exceptions.base.PySparkRuntimeError: [JAVA_GATEWAY_EXITED] Java gateway process exited before sending its port number.\n" + "\u001b[92m\u001b[1mSuccess! Your environment is set up and you are ready for the first workshop.\u001b[0m\n" ] } ], From 1166e7394b948f9b46e11002e0c998cdca6c293e Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Thu, 15 Aug 2024 11:24:30 +1000 Subject: [PATCH 04/11] Update Python_PreReq_Notebook.ipynb --- tutorials/Python_PreReq_Notebook.ipynb | 225 ++++++++++++++++++++++++- 1 file changed, 216 insertions(+), 9 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index f403755..e6cbf68 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "52550d58", "metadata": { "ExecuteTime": { @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "bbd9bd35", "metadata": { "ExecuteTime": { @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "afb62526", "metadata": { "ExecuteTime": { @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "14395028", "metadata": { "ExecuteTime": { @@ -349,10 +349,209 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "a3865f8a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
296461922024-01-31 23:45:592024-01-31 23:54:36NaN3.18NaNNone107263015.770.000.52.000.001.021.77NaNNaN
296462012024-01-31 23:13:072024-01-31 23:27:52NaN4.00NaNNone114236018.401.000.52.340.001.025.74NaNNaN
296462122024-01-31 23:19:002024-01-31 23:38:00NaN3.33NaNNone21125019.970.000.50.000.001.023.97NaNNaN
296462222024-01-31 23:07:232024-01-31 23:25:14NaN3.06NaNNone10713023.880.000.55.580.001.033.46NaNNaN
296462312024-01-31 23:58:252024-02-01 00:13:30NaN8.10NaNNone13875032.407.750.57.296.941.055.88NaNNaN
\n", + "
" + ], + "text/plain": [ + " VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count \\\n", + "2964619 2 2024-01-31 23:45:59 2024-01-31 23:54:36 NaN \n", + "2964620 1 2024-01-31 23:13:07 2024-01-31 23:27:52 NaN \n", + "2964621 2 2024-01-31 23:19:00 2024-01-31 23:38:00 NaN \n", + "2964622 2 2024-01-31 23:07:23 2024-01-31 23:25:14 NaN \n", + "2964623 1 2024-01-31 23:58:25 2024-02-01 00:13:30 NaN \n", + "\n", + " trip_distance RatecodeID store_and_fwd_flag PULocationID \\\n", + "2964619 3.18 NaN None 107 \n", + "2964620 4.00 NaN None 114 \n", + "2964621 3.33 NaN None 211 \n", + "2964622 3.06 NaN None 107 \n", + "2964623 8.10 NaN None 138 \n", + "\n", + " DOLocationID payment_type fare_amount extra mta_tax tip_amount \\\n", + "2964619 263 0 15.77 0.00 0.5 2.00 \n", + "2964620 236 0 18.40 1.00 0.5 2.34 \n", + "2964621 25 0 19.97 0.00 0.5 0.00 \n", + "2964622 13 0 23.88 0.00 0.5 5.58 \n", + "2964623 75 0 32.40 7.75 0.5 7.29 \n", + "\n", + " tolls_amount improvement_surcharge total_amount \\\n", + "2964619 0.00 1.0 21.77 \n", + "2964620 0.00 1.0 25.74 \n", + "2964621 0.00 1.0 23.97 \n", + "2964622 0.00 1.0 33.46 \n", + "2964623 6.94 1.0 55.88 \n", + "\n", + " congestion_surcharge Airport_fee \n", + "2964619 NaN NaN \n", + "2964620 NaN NaN \n", + "2964621 NaN NaN \n", + "2964622 NaN NaN \n", + "2964623 NaN NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import pandas as pd\n", "\n", @@ -458,10 +657,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "1b5a59dc", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[92m\u001b[1mSuccess! Your environment is set up and you are ready for the first workshop.\u001b[0m\n" + ] + } + ], "source": [ "import base64\n", "import traceback\n", @@ -503,7 +710,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.7" }, "latex_envs": { "LaTeX_envs_menu_present": true, From 4d04aee75c6e5e57ab9e92217f4e97c70724bce6 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Thu, 22 Aug 2024 03:56:30 +1000 Subject: [PATCH 05/11] 111 --- tutorials/Python_PreReq_Notebook.ipynb | 42 ++++++-------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index b1252ac..a4d2883 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -153,11 +153,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 2, -======= "execution_count": 4, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 "id": "52550d58", "metadata": { "ExecuteTime": { @@ -189,11 +185,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 3, -======= "execution_count": 5, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 "id": "bbd9bd35", "metadata": { "ExecuteTime": { @@ -230,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "id": "afb62526", "metadata": { "ExecuteTime": { @@ -248,11 +240,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 5, -======= - "execution_count": 2, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 + "execution_count": 7, "id": "14395028", "metadata": { "ExecuteTime": { @@ -268,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "613317b5", "metadata": { "ExecuteTime": { @@ -361,11 +349,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 7, -======= - "execution_count": 6, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 + "execution_count": 3, "id": "a3865f8a", "metadata": {}, "outputs": [ @@ -563,11 +547,7 @@ "2964623 NaN NaN " ] }, -<<<<<<< HEAD - "execution_count": 7, -======= - "execution_count": 6, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -677,11 +657,7 @@ }, { "cell_type": "code", -<<<<<<< HEAD - "execution_count": 8, -======= - "execution_count": 7, ->>>>>>> e3b4ed5be1fee5abb0797cf1c0e224fd8afe4bf8 + "execution_count": 2, "id": "1b5a59dc", "metadata": {}, "outputs": [ @@ -720,9 +696,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (myenv)", "language": "python", - "name": "python3" + "name": "myenv" }, "language_info": { "codemirror_mode": { @@ -734,7 +710,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.7" + "version": "3.12.5" }, "latex_envs": { "LaTeX_envs_menu_present": true, From 5c568840d4c74923b840c5a90e5e62e91d2c0bae Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Fri, 23 Aug 2024 10:23:49 +1000 Subject: [PATCH 06/11] change --- tutorials/Python_PreReq_Notebook.ipynb | 6 +- tutorials/tute_1/Tute1_Python.ipynb | 387 ++++++++++++++++++++++--- tutorials/tute_2/Tute2_Python.ipynb | 10 +- 3 files changed, 355 insertions(+), 48 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index a4d2883..5bfe443 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 9, "id": "a3865f8a", "metadata": {}, "outputs": [ @@ -547,7 +547,7 @@ "2964623 NaN NaN " ] }, - "execution_count": 3, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -657,7 +657,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "id": "1b5a59dc", "metadata": {}, "outputs": [ diff --git a/tutorials/tute_1/Tute1_Python.ipynb b/tutorials/tute_1/Tute1_Python.ipynb index 617e32a..5619a2a 100644 --- a/tutorials/tute_1/Tute1_Python.ipynb +++ b/tutorials/tute_1/Tute1_Python.ipynb @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -87,14 +87,50 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:02:52.129150Z", "start_time": "2023-07-24T04:02:45.036746Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-RECORD 0------------------------------------\n", + " VendorID | 2 \n", + " tpep_pickup_datetime | 2024-01-01 00:57:55 \n", + " tpep_dropoff_datetime | 2024-01-01 01:17:43 \n", + " passenger_count | 1 \n", + " trip_distance | 1.72 \n", + " RatecodeID | 1 \n", + " store_and_fwd_flag | N \n", + " PULocationID | 186 \n", + " DOLocationID | 79 \n", + " payment_type | 2 \n", + " fare_amount | 17.7 \n", + " extra | 1.0 \n", + " mta_tax | 0.5 \n", + " tip_amount | 0.0 \n", + " tolls_amount | 0.0 \n", + " improvement_surcharge | 1.0 \n", + " total_amount | 22.7 \n", + " congestion_surcharge | 2.5 \n", + " Airport_fee | 0.0 \n", + "only showing top 1 row\n", + "\n" + ] + } + ], "source": [ "# sdf = spark df = spark data frame\n", "sdf = spark.read.parquet('../../data/tlc_data/2024-01.parquet')\n", @@ -112,14 +148,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:02.224352Z", "start_time": "2023-07-24T04:02:59.918648Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
22024-01-01 00:57:552024-01-01 01:17:4311.721N18679217.71.00.50.00.01.022.72.50.0
12024-01-01 00:03:002024-01-01 00:09:3611.81N140236110.03.50.53.750.01.018.752.50.0
12024-01-01 00:17:062024-01-01 00:35:0114.71N23679123.33.50.53.00.01.031.32.50.0
12024-01-01 00:36:382024-01-01 00:44:5611.41N79211110.03.50.52.00.01.017.02.50.0
12024-01-01 00:46:512024-01-01 00:52:5710.81N21114817.93.50.53.20.01.016.12.50.0
\n" + ], + "text/plain": [ + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "| 2| 2024-01-01 00:57:55| 2024-01-01 01:17:43| 1| 1.72| 1| N| 186| 79| 2| 17.7| 1.0| 0.5| 0.0| 0.0| 1.0| 22.7| 2.5| 0.0|\n", + "| 1| 2024-01-01 00:03:00| 2024-01-01 00:09:36| 1| 1.8| 1| N| 140| 236| 1| 10.0| 3.5| 0.5| 3.75| 0.0| 1.0| 18.75| 2.5| 0.0|\n", + "| 1| 2024-01-01 00:17:06| 2024-01-01 00:35:01| 1| 4.7| 1| N| 236| 79| 1| 23.3| 3.5| 0.5| 3.0| 0.0| 1.0| 31.3| 2.5| 0.0|\n", + "| 1| 2024-01-01 00:36:38| 2024-01-01 00:44:56| 1| 1.4| 1| N| 79| 211| 1| 10.0| 3.5| 0.5| 2.0| 0.0| 1.0| 17.0| 2.5| 0.0|\n", + "| 1| 2024-01-01 00:46:51| 2024-01-01 00:52:57| 1| 0.8| 1| N| 211| 148| 1| 7.9| 3.5| 0.5| 3.2| 0.0| 1.0| 16.1| 2.5| 0.0|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sdf.limit(5)" ] @@ -133,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:02.413366Z", @@ -155,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.617652Z", @@ -169,7 +234,7 @@ "(2964624, 9554778)" ] }, - "execution_count": 16, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -191,28 +256,67 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.638872Z", "start_time": "2023-07-24T04:03:05.619077Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "root\n", + " |-- VendorID: integer (nullable = true)\n", + " |-- tpep_pickup_datetime: timestamp_ntz (nullable = true)\n", + " |-- tpep_dropoff_datetime: timestamp_ntz (nullable = true)\n", + " |-- passenger_count: long (nullable = true)\n", + " |-- trip_distance: double (nullable = true)\n", + " |-- RatecodeID: long (nullable = true)\n", + " |-- store_and_fwd_flag: string (nullable = true)\n", + " |-- PULocationID: integer (nullable = true)\n", + " |-- DOLocationID: integer (nullable = true)\n", + " |-- payment_type: long (nullable = true)\n", + " |-- fare_amount: double (nullable = true)\n", + " |-- extra: double (nullable = true)\n", + " |-- mta_tax: double (nullable = true)\n", + " |-- tip_amount: double (nullable = true)\n", + " |-- tolls_amount: double (nullable = true)\n", + " |-- improvement_surcharge: double (nullable = true)\n", + " |-- total_amount: double (nullable = true)\n", + " |-- congestion_surcharge: double (nullable = true)\n", + " |-- Airport_fee: double (nullable = true)\n", + "\n" + ] + } + ], "source": [ "sdf.printSchema()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.731117Z", "start_time": "2023-07-24T04:03:05.723079Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "StructType([StructField('VendorID', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('Airport_fee', DoubleType(), True)])" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sdf.schema" ] @@ -242,14 +346,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:09.238597Z", "start_time": "2023-07-24T04:03:08.260421Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
passenger_count
1
1
1
1
1
\n" + ], + "text/plain": [ + "+---------------+\n", + "|passenger_count|\n", + "+---------------+\n", + "| 1|\n", + "| 1|\n", + "| 1|\n", + "| 1|\n", + "| 1|\n", + "+---------------+" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sdf.select('passenger_count').limit(5)" ] @@ -263,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:13.527228Z", @@ -289,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:15.415531Z", @@ -303,14 +436,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:15.932836Z", "start_time": "2023-07-24T04:03:15.595811Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Column<'passenger_count'>" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "F.col(\"passenger_count\")" ] @@ -324,14 +468,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:18.276168Z", "start_time": "2023-07-24T04:03:16.125585Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
VendorIDtpep_pickup_datetimetpep_dropoff_datetimepassenger_counttrip_distanceRatecodeIDstore_and_fwd_flagPULocationIDDOLocationIDpayment_typefare_amountextramta_taxtip_amounttolls_amountimprovement_surchargetotal_amountcongestion_surchargeAirport_fee
22024-01-01 00:14:522024-01-01 00:17:3850.551N23914315.11.00.51.520.01.011.622.50.0
22024-01-01 00:22:062024-01-01 00:51:2853.11N143170126.81.00.56.360.01.038.162.50.0
22024-01-01 00:11:222024-01-01 00:20:1451.631N246246111.41.00.54.920.01.021.322.50.0
22024-01-01 00:17:212024-01-01 00:23:2251.311N22914027.91.00.50.00.01.012.92.50.0
22024-01-01 00:17:172024-01-01 00:23:1150.941N14423117.91.00.50.00.01.012.92.50.0
\n" + ], + "text/plain": [ + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|Airport_fee|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+\n", + "| 2| 2024-01-01 00:14:52| 2024-01-01 00:17:38| 5| 0.55| 1| N| 239| 143| 1| 5.1| 1.0| 0.5| 1.52| 0.0| 1.0| 11.62| 2.5| 0.0|\n", + "| 2| 2024-01-01 00:22:06| 2024-01-01 00:51:28| 5| 3.1| 1| N| 143| 170| 1| 26.8| 1.0| 0.5| 6.36| 0.0| 1.0| 38.16| 2.5| 0.0|\n", + "| 2| 2024-01-01 00:11:22| 2024-01-01 00:20:14| 5| 1.63| 1| N| 246| 246| 1| 11.4| 1.0| 0.5| 4.92| 0.0| 1.0| 21.32| 2.5| 0.0|\n", + "| 2| 2024-01-01 00:17:21| 2024-01-01 00:23:22| 5| 1.31| 1| N| 229| 140| 2| 7.9| 1.0| 0.5| 0.0| 0.0| 1.0| 12.9| 2.5| 0.0|\n", + "| 2| 2024-01-01 00:17:17| 2024-01-01 00:23:11| 5| 0.94| 1| N| 144| 231| 1| 7.9| 1.0| 0.5| 0.0| 0.0| 1.0| 12.9| 2.5| 0.0|\n", + "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sdf.filter(F.col('passenger_count') == 5).limit(5)" ] @@ -345,7 +518,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:19.219000Z", @@ -368,14 +541,43 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:23.940677Z", "start_time": "2023-07-24T04:03:21.475197Z" } }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
passenger_countavg(trip_distance)
02.7438750993167025
72.29375
62.951688811345235
53.0734722139318347
13.1375658449911463
\n" + ], + "text/plain": [ + "+---------------+------------------+\n", + "|passenger_count|avg(trip_distance)|\n", + "+---------------+------------------+\n", + "| 0|2.7438750993167025|\n", + "| 7| 2.29375|\n", + "| 6| 2.951688811345235|\n", + "| 5|3.0734722139318347|\n", + "| 1|3.1375658449911463|\n", + "+---------------+------------------+" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "sdf.groupby('passenger_count').mean('trip_distance').limit(5)" ] @@ -391,7 +593,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:25.388514Z", @@ -399,7 +601,30 @@ }, "code_folding": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+-------------------+-----------------------+\n", + "|passenger_count|avg_trip_amount_usd|max_trip_distance_miles|\n", + "+---------------+-------------------+-----------------------+\n", + "| NULL| 25.811736633327225| 312722.3|\n", + "| 0| 25.327816939456696| 60.2|\n", + "| 1| 26.20523044549061| 15400.32|\n", + "| 2| 29.5206599309276| 277.4|\n", + "| 3| 29.138309044289365| 83.92|\n", + "| 4| 30.877266710278| 120.78|\n", + "| 5| 26.26912911120415| 120.86|\n", + "| 6| 25.801183286359887| 44.52|\n", + "| 7| 57.735| 11.96|\n", + "| 8| 95.66803921568629| 17.74|\n", + "| 9| 18.45| 1.8|\n", + "+---------------+-------------------+-----------------------+\n", + "\n" + ] + } + ], "source": [ "aggregated_results = sdf \\\n", " .groupBy(\"passenger_count\") \\\n", @@ -422,7 +647,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:27.419466Z", @@ -447,14 +672,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:27.775017Z", "start_time": "2023-07-24T04:03:27.420416Z" } }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+---------------+-------------------+-----------------------+\n", + "|passenger_count|avg_trip_amount_usd|max_trip_distance_miles|\n", + "+---------------+-------------------+-----------------------+\n", + "| NULL| 25.811736633327225| 312722.3|\n", + "| 0| 25.327816939456696| 60.2|\n", + "| 1| 26.20523044549061| 15400.32|\n", + "| 2| 29.5206599309276| 277.4|\n", + "| 3| 29.138309044289365| 83.92|\n", + "| 4| 30.877266710278| 120.78|\n", + "| 5| 26.26912911120415| 120.86|\n", + "| 6| 25.801183286359887| 44.52|\n", + "| 7| 57.735| 11.96|\n", + "| 8| 95.66803921568629| 17.74|\n", + "| 9| 18.45| 1.8|\n", + "+---------------+-------------------+-----------------------+\n", + "\n" + ] + } + ], "source": [ "temp_results = spark.read.parquet('../../data/tute_data/aggregated_results')\n", "temp_results.show()" @@ -522,7 +770,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:28.560339Z", @@ -536,7 +784,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:34.788025Z", @@ -544,7 +792,21 @@ }, "scrolled": true }, - "outputs": [], + "outputs": [ + { + "ename": "ModuleNotFoundError", + "evalue": "No module named 'distutils'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[34], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43msdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mSAMPLE_SIZE\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoPandas\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../data/tute_data/sample_data.csv\u001b[39m\u001b[38;5;124m'\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pyspark/sql/pandas/conversion.py:86\u001b[0m, in \u001b[0;36mPandasConversionMixin.toPandas\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyspark\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msql\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtypes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _create_converter_to_pandas\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyspark\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msql\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_minimum_pandas_version\n\u001b[0;32m---> 86\u001b[0m \u001b[43mrequire_minimum_pandas_version\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 90\u001b[0m jconf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msparkSession\u001b[38;5;241m.\u001b[39m_jconf\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pyspark/sql/pandas/utils.py:24\u001b[0m, in \u001b[0;36mrequire_minimum_pandas_version\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# TODO(HyukjinKwon): Relocate and deduplicate the version specification.\u001b[39;00m\n\u001b[1;32m 22\u001b[0m minimum_pandas_version \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1.0.5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdistutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LooseVersion\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\n", + "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'distutils'" + ] + } + ], "source": [ "import pandas as pd\n", "df = sdf.sample(SAMPLE_SIZE, seed=0).toPandas()\n", @@ -553,14 +815,26 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 28, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:34.855142Z", "start_time": "2023-07-24T04:03:34.789165Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'df' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[28], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../data/tute_data/sample_data.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", + "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" + ] + } + ], "source": [ "df.to_parquet('../../data/tute_data/sample_data.parquet')" ] @@ -576,14 +850,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:35.029098Z", "start_time": "2023-07-24T04:03:34.856046Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../../data/tute_data/sample_data.csv'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m:1\u001b[0m\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../data/tute_data/sample_data.csv'" + ] + } + ], "source": [ "%%time\n", "df_csv = pd.read_csv('../../data/tute_data/sample_data.csv')" @@ -591,14 +882,30 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:35.040185Z", "start_time": "2023-07-24T04:03:35.030786Z" } }, - "outputs": [], + "outputs": [ + { + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: '../../data/tute_data/sample_data.parquet'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "File \u001b[0;32m:1\u001b[0m\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:667\u001b[0m, in \u001b[0;36mread_parquet\u001b[0;34m(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)\u001b[0m\n\u001b[1;32m 664\u001b[0m use_nullable_dtypes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 665\u001b[0m check_dtype_backend(dtype_backend)\n\u001b[0;32m--> 667\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype_backend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:267\u001b[0m, in \u001b[0;36mPyArrowImpl.read\u001b[0;34m(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m manager \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 265\u001b[0m to_pandas_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplit_blocks\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# type: ignore[assignment]\u001b[39;00m\n\u001b[0;32m--> 267\u001b[0m path_or_handle, handles, filesystem \u001b[38;5;241m=\u001b[39m \u001b[43m_get_path_or_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 274\u001b[0m pa_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi\u001b[38;5;241m.\u001b[39mparquet\u001b[38;5;241m.\u001b[39mread_table(\n\u001b[1;32m 275\u001b[0m path_or_handle,\n\u001b[1;32m 276\u001b[0m columns\u001b[38;5;241m=\u001b[39mcolumns,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 280\u001b[0m )\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:140\u001b[0m, in \u001b[0;36m_get_path_or_handle\u001b[0;34m(path, fs, storage_options, mode, is_dir)\u001b[0m\n\u001b[1;32m 130\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 132\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m fs\n\u001b[1;32m 133\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dir\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# fsspec resources can also point to directories\u001b[39;00m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# this branch is used for example when reading from non-fsspec URLs\u001b[39;00m\n\u001b[0;32m--> 140\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_handle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 142\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m fs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 144\u001b[0m path_or_handle \u001b[38;5;241m=\u001b[39m handles\u001b[38;5;241m.\u001b[39mhandle\n", + "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../data/tute_data/sample_data.parquet'" + ] + } + ], "source": [ "%%time\n", "df_parquet = pd.read_parquet('../../data/tute_data/sample_data.parquet')" @@ -705,9 +1012,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (myenv)", "language": "python", - "name": "python3" + "name": "myenv" }, "language_info": { "codemirror_mode": { @@ -719,7 +1026,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" }, "latex_envs": { "LaTeX_envs_menu_present": true, diff --git a/tutorials/tute_2/Tute2_Python.ipynb b/tutorials/tute_2/Tute2_Python.ipynb index 8933bdb..48637b7 100644 --- a/tutorials/tute_2/Tute2_Python.ipynb +++ b/tutorials/tute_2/Tute2_Python.ipynb @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T03:53:44.620316Z", @@ -291,7 +291,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\r", + "\r\n", " \r" ] } @@ -4158,9 +4158,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python (myenv)", "language": "python", - "name": "python3" + "name": "myenv" }, "language_info": { "codemirror_mode": { @@ -4172,7 +4172,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.5" }, "latex_envs": { "LaTeX_envs_menu_present": true, From 17b17cd6066fd44e9c4e2061a023b18705839059 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Wed, 4 Sep 2024 19:27:17 +1000 Subject: [PATCH 07/11] complete --- tutorials/Python_PreReq_Notebook.ipynb | 9 ++------- tutorials/tute_1/Tute1_Python.ipynb | 9 ++------- 2 files changed, 4 insertions(+), 14 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index 5bfe443..4b89f59 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -696,9 +696,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (myenv)", + "display_name": "myenv", "language": "python", - "name": "myenv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -771,11 +771,6 @@ "_Feature" ], "window_display": false - }, - "vscode": { - "interpreter": { - "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" - } } }, "nbformat": 4, diff --git a/tutorials/tute_1/Tute1_Python.ipynb b/tutorials/tute_1/Tute1_Python.ipynb index 5619a2a..aeed35a 100644 --- a/tutorials/tute_1/Tute1_Python.ipynb +++ b/tutorials/tute_1/Tute1_Python.ipynb @@ -1012,9 +1012,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python (myenv)", + "display_name": "myenv", "language": "python", - "name": "myenv" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1087,11 +1087,6 @@ "_Feature" ], "window_display": false - }, - "vscode": { - "interpreter": { - "hash": "1c6f7b18ea35922dad1f927d5d0123541ee5478d7a9729c6a2c6ed680be427a0" - } } }, "nbformat": 4, From 946e14c3878c04ef29449750846d2733cda9e998 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Wed, 4 Sep 2024 19:29:49 +1000 Subject: [PATCH 08/11] 1 --- tutorials/Python_PreReq_Notebook.ipynb | 16 +-- tutorials/tute_1/Tute1_Python.ipynb | 143 ++++++++++--------------- 2 files changed, 62 insertions(+), 97 deletions(-) diff --git a/tutorials/Python_PreReq_Notebook.ipynb b/tutorials/Python_PreReq_Notebook.ipynb index 4b89f59..f844a38 100644 --- a/tutorials/Python_PreReq_Notebook.ipynb +++ b/tutorials/Python_PreReq_Notebook.ipynb @@ -153,7 +153,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "id": "52550d58", "metadata": { "ExecuteTime": { @@ -185,7 +185,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 8, "id": "bbd9bd35", "metadata": { "ExecuteTime": { @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 9, "id": "afb62526", "metadata": { "ExecuteTime": { @@ -240,7 +240,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 10, "id": "14395028", "metadata": { "ExecuteTime": { @@ -256,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 11, "id": "613317b5", "metadata": { "ExecuteTime": { @@ -349,7 +349,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "id": "a3865f8a", "metadata": {}, "outputs": [ @@ -547,7 +547,7 @@ "2964623 NaN NaN " ] }, - "execution_count": 9, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -657,7 +657,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 15, "id": "1b5a59dc", "metadata": {}, "outputs": [ diff --git a/tutorials/tute_1/Tute1_Python.ipynb b/tutorials/tute_1/Tute1_Python.ipynb index aeed35a..8845fd7 100644 --- a/tutorials/tute_1/Tute1_Python.ipynb +++ b/tutorials/tute_1/Tute1_Python.ipynb @@ -35,7 +35,20 @@ "cell_type": "code", "execution_count": 2, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "24/09/04 19:26:00 WARN Utils: Your hostname, ZackdeMacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.102 instead (on interface en0)\n", + "24/09/04 19:26:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", + "Setting default log level to \"WARN\".\n", + "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", + "24/09/04 19:26:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", + "24/09/04 19:26:01 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n" + ] + } + ], "source": [ "from pyspark.sql import SparkSession\n", "\n", @@ -87,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 3, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:02:52.129150Z", @@ -95,13 +108,6 @@ } }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " \r" - ] - }, { "name": "stdout", "output_type": "stream", @@ -148,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:02.224352Z", @@ -180,7 +186,7 @@ "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+" ] }, - "execution_count": 6, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -198,7 +204,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:02.413366Z", @@ -220,7 +226,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.617652Z", @@ -234,7 +240,7 @@ "(2964624, 9554778)" ] }, - "execution_count": 8, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -256,7 +262,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.638872Z", @@ -298,7 +304,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:05.731117Z", @@ -312,7 +318,7 @@ "StructType([StructField('VendorID', IntegerType(), True), StructField('tpep_pickup_datetime', TimestampNTZType(), True), StructField('tpep_dropoff_datetime', TimestampNTZType(), True), StructField('passenger_count', LongType(), True), StructField('trip_distance', DoubleType(), True), StructField('RatecodeID', LongType(), True), StructField('store_and_fwd_flag', StringType(), True), StructField('PULocationID', IntegerType(), True), StructField('DOLocationID', IntegerType(), True), StructField('payment_type', LongType(), True), StructField('fare_amount', DoubleType(), True), StructField('extra', DoubleType(), True), StructField('mta_tax', DoubleType(), True), StructField('tip_amount', DoubleType(), True), StructField('tolls_amount', DoubleType(), True), StructField('improvement_surcharge', DoubleType(), True), StructField('total_amount', DoubleType(), True), StructField('congestion_surcharge', DoubleType(), True), StructField('Airport_fee', DoubleType(), True)])" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -346,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:09.238597Z", @@ -378,7 +384,7 @@ "+---------------+" ] }, - "execution_count": 11, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -396,7 +402,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:13.527228Z", @@ -422,7 +428,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:15.415531Z", @@ -436,7 +442,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:15.932836Z", @@ -450,7 +456,7 @@ "Column<'passenger_count'>" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -468,7 +474,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:18.276168Z", @@ -500,7 +506,7 @@ "+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+" ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -518,7 +524,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:19.219000Z", @@ -541,7 +547,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:23.940677Z", @@ -573,7 +579,7 @@ "+---------------+------------------+" ] }, - "execution_count": 17, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -593,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 16, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:25.388514Z", @@ -647,7 +653,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:27.419466Z", @@ -672,7 +678,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 18, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:27.775017Z", @@ -770,7 +776,7 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": 19, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:28.560339Z", @@ -784,7 +790,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": 20, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:34.788025Z", @@ -792,21 +798,7 @@ }, "scrolled": true }, - "outputs": [ - { - "ename": "ModuleNotFoundError", - "evalue": "No module named 'distutils'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[34], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df \u001b[38;5;241m=\u001b[39m \u001b[43msdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mSAMPLE_SIZE\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mseed\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtoPandas\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3\u001b[0m df\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../data/tute_data/sample_data.csv\u001b[39m\u001b[38;5;124m'\u001b[39m, index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pyspark/sql/pandas/conversion.py:86\u001b[0m, in \u001b[0;36mPandasConversionMixin.toPandas\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 83\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyspark\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msql\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mtypes\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m _create_converter_to_pandas\n\u001b[1;32m 84\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mpyspark\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msql\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mpandas\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m require_minimum_pandas_version\n\u001b[0;32m---> 86\u001b[0m \u001b[43mrequire_minimum_pandas_version\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 88\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m \u001b[38;5;21;01mpd\u001b[39;00m\n\u001b[1;32m 90\u001b[0m jconf \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39msparkSession\u001b[38;5;241m.\u001b[39m_jconf\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pyspark/sql/pandas/utils.py:24\u001b[0m, in \u001b[0;36mrequire_minimum_pandas_version\u001b[0;34m()\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[38;5;66;03m# TODO(HyukjinKwon): Relocate and deduplicate the version specification.\u001b[39;00m\n\u001b[1;32m 22\u001b[0m minimum_pandas_version \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m1.0.5\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m---> 24\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mdistutils\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mversion\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m LooseVersion\n\u001b[1;32m 26\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 27\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mpandas\u001b[39;00m\n", - "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'distutils'" - ] - } - ], + "outputs": [], "source": [ "import pandas as pd\n", "df = sdf.sample(SAMPLE_SIZE, seed=0).toPandas()\n", @@ -815,26 +807,14 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 21, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:34.855142Z", "start_time": "2023-07-24T04:03:34.789165Z" } }, - "outputs": [ - { - "ename": "NameError", - "evalue": "name 'df' is not defined", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[28], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdf\u001b[49m\u001b[38;5;241m.\u001b[39mto_parquet(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m../../data/tute_data/sample_data.parquet\u001b[39m\u001b[38;5;124m'\u001b[39m)\n", - "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined" - ] - } - ], + "outputs": [], "source": [ "df.to_parquet('../../data/tute_data/sample_data.parquet')" ] @@ -850,7 +830,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 22, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:35.029098Z", @@ -859,19 +839,11 @@ }, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '../../data/tute_data/sample_data.csv'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m:1\u001b[0m\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1026\u001b[0m, in \u001b[0;36mread_csv\u001b[0;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, date_format, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options, dtype_backend)\u001b[0m\n\u001b[1;32m 1013\u001b[0m kwds_defaults \u001b[38;5;241m=\u001b[39m _refine_defaults_read(\n\u001b[1;32m 1014\u001b[0m dialect,\n\u001b[1;32m 1015\u001b[0m delimiter,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1022\u001b[0m dtype_backend\u001b[38;5;241m=\u001b[39mdtype_backend,\n\u001b[1;32m 1023\u001b[0m )\n\u001b[1;32m 1024\u001b[0m kwds\u001b[38;5;241m.\u001b[39mupdate(kwds_defaults)\n\u001b[0;32m-> 1026\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43m_read\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:620\u001b[0m, in \u001b[0;36m_read\u001b[0;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[1;32m 617\u001b[0m _validate_names(kwds\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mnames\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m))\n\u001b[1;32m 619\u001b[0m \u001b[38;5;66;03m# Create the parser.\u001b[39;00m\n\u001b[0;32m--> 620\u001b[0m parser \u001b[38;5;241m=\u001b[39m \u001b[43mTextFileReader\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfilepath_or_buffer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwds\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 622\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m chunksize \u001b[38;5;129;01mor\u001b[39;00m iterator:\n\u001b[1;32m 623\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m parser\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1620\u001b[0m, in \u001b[0;36mTextFileReader.__init__\u001b[0;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[1;32m 1617\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moptions[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m kwds[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhas_index_names\u001b[39m\u001b[38;5;124m\"\u001b[39m]\n\u001b[1;32m 1619\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles: IOHandles \u001b[38;5;241m|\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 1620\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_engine \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_make_engine\u001b[49m\u001b[43m(\u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mengine\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parsers/readers.py:1880\u001b[0m, in \u001b[0;36mTextFileReader._make_engine\u001b[0;34m(self, f, engine)\u001b[0m\n\u001b[1;32m 1878\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1879\u001b[0m mode \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m-> 1880\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1881\u001b[0m \u001b[43m \u001b[49m\u001b[43mf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1882\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1883\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1884\u001b[0m \u001b[43m \u001b[49m\u001b[43mcompression\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcompression\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1885\u001b[0m \u001b[43m \u001b[49m\u001b[43mmemory_map\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmemory_map\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1886\u001b[0m \u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mis_text\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1887\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mencoding_errors\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstrict\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1888\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptions\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mstorage_options\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1889\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1890\u001b[0m \u001b[38;5;28;01massert\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 1891\u001b[0m f \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mhandles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/common.py:873\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 868\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(handle, \u001b[38;5;28mstr\u001b[39m):\n\u001b[1;32m 869\u001b[0m \u001b[38;5;66;03m# Check whether the filename is to be opened in binary mode.\u001b[39;00m\n\u001b[1;32m 870\u001b[0m \u001b[38;5;66;03m# Binary mode does not support 'encoding' and 'newline'.\u001b[39;00m\n\u001b[1;32m 871\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mencoding \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m ioargs\u001b[38;5;241m.\u001b[39mmode:\n\u001b[1;32m 872\u001b[0m \u001b[38;5;66;03m# Encoding\u001b[39;00m\n\u001b[0;32m--> 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\n\u001b[1;32m 874\u001b[0m \u001b[43m \u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 875\u001b[0m \u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 876\u001b[0m \u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 877\u001b[0m \u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 878\u001b[0m \u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 879\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[1;32m 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(handle, ioargs\u001b[38;5;241m.\u001b[39mmode)\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../data/tute_data/sample_data.csv'" + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 109 ms, sys: 11.4 ms, total: 120 ms\n", + "Wall time: 120 ms\n" ] } ], @@ -882,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 23, "metadata": { "ExecuteTime": { "end_time": "2023-07-24T04:03:35.040185Z", @@ -891,18 +863,11 @@ }, "outputs": [ { - "ename": "FileNotFoundError", - "evalue": "[Errno 2] No such file or directory: '../../data/tute_data/sample_data.parquet'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", - "File \u001b[0;32m:1\u001b[0m\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:667\u001b[0m, in \u001b[0;36mread_parquet\u001b[0;34m(path, engine, columns, storage_options, use_nullable_dtypes, dtype_backend, filesystem, filters, **kwargs)\u001b[0m\n\u001b[1;32m 664\u001b[0m use_nullable_dtypes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 665\u001b[0m check_dtype_backend(dtype_backend)\n\u001b[0;32m--> 667\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mimpl\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mread\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 668\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 669\u001b[0m \u001b[43m \u001b[49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 670\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilters\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilters\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 671\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 672\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_nullable_dtypes\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 673\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtype_backend\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdtype_backend\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 674\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 675\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 676\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:267\u001b[0m, in \u001b[0;36mPyArrowImpl.read\u001b[0;34m(self, path, columns, filters, use_nullable_dtypes, dtype_backend, storage_options, filesystem, **kwargs)\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m manager \u001b[38;5;241m==\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124marray\u001b[39m\u001b[38;5;124m\"\u001b[39m:\n\u001b[1;32m 265\u001b[0m to_pandas_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124msplit_blocks\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# type: ignore[assignment]\u001b[39;00m\n\u001b[0;32m--> 267\u001b[0m path_or_handle, handles, filesystem \u001b[38;5;241m=\u001b[39m \u001b[43m_get_path_or_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 268\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 269\u001b[0m \u001b[43m \u001b[49m\u001b[43mfilesystem\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 270\u001b[0m \u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 271\u001b[0m \u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 272\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 273\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 274\u001b[0m pa_table \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapi\u001b[38;5;241m.\u001b[39mparquet\u001b[38;5;241m.\u001b[39mread_table(\n\u001b[1;32m 275\u001b[0m path_or_handle,\n\u001b[1;32m 276\u001b[0m columns\u001b[38;5;241m=\u001b[39mcolumns,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs,\n\u001b[1;32m 280\u001b[0m )\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/parquet.py:140\u001b[0m, in \u001b[0;36m_get_path_or_handle\u001b[0;34m(path, fs, storage_options, mode, is_dir)\u001b[0m\n\u001b[1;32m 130\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 132\u001b[0m \u001b[38;5;129;01mnot\u001b[39;00m fs\n\u001b[1;32m 133\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dir\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 138\u001b[0m \u001b[38;5;66;03m# fsspec resources can also point to directories\u001b[39;00m\n\u001b[1;32m 139\u001b[0m \u001b[38;5;66;03m# this branch is used for example when reading from non-fsspec URLs\u001b[39;00m\n\u001b[0;32m--> 140\u001b[0m handles \u001b[38;5;241m=\u001b[39m \u001b[43mget_handle\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43mpath_or_handle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mis_text\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mFalse\u001b[39;49;00m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstorage_options\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstorage_options\u001b[49m\n\u001b[1;32m 142\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 143\u001b[0m fs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 144\u001b[0m path_or_handle \u001b[38;5;241m=\u001b[39m handles\u001b[38;5;241m.\u001b[39mhandle\n", - "File \u001b[0;32m~/myenv/lib/python3.12/site-packages/pandas/io/common.py:882\u001b[0m, in \u001b[0;36mget_handle\u001b[0;34m(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)\u001b[0m\n\u001b[1;32m 873\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\n\u001b[1;32m 874\u001b[0m handle,\n\u001b[1;32m 875\u001b[0m ioargs\u001b[38;5;241m.\u001b[39mmode,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 878\u001b[0m newline\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 879\u001b[0m )\n\u001b[1;32m 880\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 881\u001b[0m \u001b[38;5;66;03m# Binary mode\u001b[39;00m\n\u001b[0;32m--> 882\u001b[0m handle \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mopen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mhandle\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mioargs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmode\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 883\u001b[0m handles\u001b[38;5;241m.\u001b[39mappend(handle)\n\u001b[1;32m 885\u001b[0m \u001b[38;5;66;03m# Convert BytesIO or file objects passed with an encoding\u001b[39;00m\n", - "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: '../../data/tute_data/sample_data.parquet'" + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 20.3 ms, sys: 15 ms, total: 35.3 ms\n", + "Wall time: 13.6 ms\n" ] } ], From 6688b3df4f2854520b9dd78e5e446cca001ea8b0 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Wed, 4 Sep 2024 20:03:56 +1000 Subject: [PATCH 09/11] 1 --- plots/foliumChoroplethMap.html | 76 ++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 31 deletions(-) diff --git a/plots/foliumChoroplethMap.html b/plots/foliumChoroplethMap.html index ca2fc7b..e9b7d39 100644 --- a/plots/foliumChoroplethMap.html +++ b/plots/foliumChoroplethMap.html @@ -1,5 +1,7 @@ - + + + - - + + + - - - - + + + + - + + -
+
- \ No newline at end of file + + geo_json_0abb6ba563cab65a8238dc20463d9c5c.addTo(choropleth_ab38675c85c8cd729255dcfdc98d9552); + + + choropleth_ab38675c85c8cd729255dcfdc98d9552.addTo(map_52cb4f46b9d9a9291780e49848db5fbd); + + + \ No newline at end of file From b6923d8dc468327988001d3798f6d7a3e0fd41e7 Mon Sep 17 00:00:00 2001 From: JianghaoNing Date: Wed, 4 Sep 2024 20:30:45 +1000 Subject: [PATCH 10/11] 1 --- plots/foliumChoroplethMap.html | 34 +- tutorials/tute_2/Tute2_Python.ipynb | 2683 ++++++++++++++------------- 2 files changed, 1457 insertions(+), 1260 deletions(-) diff --git a/plots/foliumChoroplethMap.html b/plots/foliumChoroplethMap.html index e9b7d39..3e4da9c 100644 --- a/plots/foliumChoroplethMap.html +++ b/plots/foliumChoroplethMap.html @@ -25,7 +25,7 @@