diff --git a/local/to_datetime_code.ipynb b/local/to_datetime_code.ipynb new file mode 100644 index 0000000..d04b0f2 --- /dev/null +++ b/local/to_datetime_code.ipynb @@ -0,0 +1,340 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "ec38f3d5-6fb5-4975-818a-1d5a7243a136", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + " 2021-01-01 9:25 AM,3 hours 12 minutes\n", + " 2021-02-03 4:25 PM,2 hours\n", + " 2021-03-05 1:25 PM,15 minutes\n", + " 2021-03-05 11:25 PM,55 minutes\n", + " \"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# dtype: object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57882342-cc3b-47c8-92c4-b1dcc960772e", + "metadata": {}, + "outputs": [], + "source": [ + "from dask.dataframe import to_datetime\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", + "\n", + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# converted_timestamp_start datetime64[ns]\n", + "# dtype: object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "40457ca3-d467-46a0-b617-68b0d9587e29", + "metadata": {}, + "outputs": [], + "source": [ + "ddf[\"day_of_week\"] = ddf[\"converted_timestamp_start\"].dt.dayofweek\n", + "\n", + "print(ddf[[\"converted_timestamp_start\", \"day_of_week\"]].compute())\n", + "# converted_timestamp_start day_of_week\n", + "# 0 2021-01-01 09:25:00 4\n", + "# 1 2021-02-03 16:25:00 2\n", + "# 2 2021-03-05 13:25:00 4\n", + "# 3 2021-03-05 23:25:00 4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80dd5017-32cb-4a45-90f4-3d520f84f0cd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from pandas import Timedelta, to_timedelta\n", + "\n", + "ddf[\"converted_time_worked\"] = (\n", + " ddf[\"time_worked\"].apply(lambda x: to_timedelta(x), meta=Timedelta).compute()\n", + ")\n", + "\n", + "print(ddf[[\"converted_timestamp_start\", \"converted_time_worked\"]].compute())\n", + "# converted_timestamp_start converted_time_worked\n", + "# 0 2021-01-01 09:25:00 0 days 03:12:00\n", + "# 1 2021-02-03 16:25:00 0 days 02:00:00\n", + "# 2 2021-03-05 13:25:00 0 days 00:15:00\n", + "# 3 2021-03-05 23:25:00 0 days 00:55:00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "61d13f66-c822-4607-a603-d33b12ae4c73", + "metadata": {}, + "outputs": [], + "source": [ + "ddf[\"work_completed\"] = ddf[\"converted_timestamp_start\"] + ddf[\"converted_time_worked\"]\n", + "\n", + "print(\n", + " ddf[\n", + " [\"converted_timestamp_start\", \"converted_time_worked\", \"work_completed\"]\n", + " ].compute()\n", + ")\n", + "# converted_timestamp_start converted_time_worked work_completed\n", + "# 0 2021-01-01 09:25:00 0 days 03:12:00 2021-01-01 12:37:00\n", + "# 1 2021-02-03 16:25:00 0 days 02:00:00 2021-02-03 18:25:00\n", + "# 2 2021-03-05 13:25:00 0 days 00:15:00 2021-03-05 13:40:00\n", + "# 3 2021-03-05 23:25:00 0 days 00:55:00 2021-03-06 00:20:00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c1d19aa-f31e-45d9-8a59-84fe94fe5ecb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ddf[\"converted_timestamp_start\"].dt.floor(\"15 min\").compute()\n", + "# 0 2021-01-01 09:15:00\n", + "# 1 2021-02-03 16:15:00\n", + "# 2 2021-03-05 13:15:00\n", + "# 3 2021-03-05 23:15:00\n", + "# Name: converted_timestamp_start, dtype: datetime64[ns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7532fbcf-132e-4135-98e2-3449fca86a5d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + " 2021-01-01 9:25 AM,3 hours 12 minutes\n", + " 2021-02-03 4:25 PM,2 hours\n", + " missing ,15 minutes\n", + " 2021-03-05 11:?? PM,55 minutes\n", + " \"\"\"\n", + ")\n", + "\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "print(ddf.dtypes)\n", + "# timestamp_start object\n", + "# time_worked object\n", + "# dtype: object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "540f36d1-b64c-4ba7-9051-ee5af9df3ea8", + "metadata": {}, + "outputs": [], + "source": [ + "print(to_datetime(ddf[\"timestamp_start\"], errors=\"coerce\").compute())\n", + "# 0 2021-01-01 09:25:00\n", + "# 1 2021-02-03 14:25:00\n", + "# 2 NaT\n", + "# 3 NaT\n", + "# dtype: datetime64[ns]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c55679c-0c4f-4c5e-8456-c6778ade481d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "print(to_datetime(ddf[\"timestamp_start\"], errors=\"ignore\").compute())\n", + "# 0 2021-01-01 09:25:00\n", + "# 1 2021-02-03 16:25:00\n", + "# 2 missing\n", + "# 3 2021-03-05 11:?? PM\n", + "# dtype: object" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0016b448-9635-4e36-a407-f9e1ed39cbea", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + "2021-01-01 9:25 AM,3 hours 12 minutes\n", + "\"Thursday, October 9, 2022 14:25\",2 hours\n", + "\"January 12, 2022 14:25\",15 minutes\n", + " \"\"\"\n", + ")\n", + "\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(ddf[\"timestamp_start\"])\n", + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n", + "# timestamp_start converted_timestamp_start\n", + "# 0 2021-01-01 9:25 AM 2021-01-01 09:25:00\n", + "# 1 Thursday, October 9, 2022 14:25 2022-10-09 14:25:00\n", + "# 2 January 12, 2022 14:25 2022-01-12 14:25:00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29f40937-9bea-499a-a3cb-c17c14b7be71", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_start,time_worked\n", + "year 2021: 01/01 9:25 AM,3 hours 12 minutes\n", + "year 2021: 01/03 3:25 PM,2 hours\n", + "year 2021: 01/05 11:25 AM,2 hours\n", + "\"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_timestamp_start\"] = to_datetime(\n", + " ddf[\"timestamp_start\"], format=\"year %Y: %m/%d %I:%M %p\"\n", + ")\n", + "print(ddf[[\"timestamp_start\", \"converted_timestamp_start\"]].compute())\n", + "# timestamp_start converted_timestamp_start\n", + "# 0 year 2021: 01/01 9:25 AM 2021-01-01 09:25:00\n", + "# 1 year 2021: 01/03 3:25 PM 2021-01-03 15:25:00\n", + "# 2 year 2021: 01/05 11:25 AM 2021-01-05 11:25:00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "110ce98e-d8bf-4501-a767-064f6bee56a3", + "metadata": {}, + "outputs": [], + "source": [ + "from io import StringIO\n", + "\n", + "from dask.dataframe import from_pandas, to_datetime\n", + "from pandas import read_csv\n", + "\n", + "data = StringIO(\n", + " \"\"\"timestamp_local,location\n", + "2021-01-01 09:01:12,Asia/Almaty\n", + "2021-01-01 09:01:12,Europe/London\n", + "2021-01-01 09:01:12,America/New_York\n", + "\"\"\"\n", + ")\n", + "df = read_csv(data)\n", + "ddf = from_pandas(df, npartitions=2)\n", + "\n", + "ddf[\"converted_date\"] = to_datetime(ddf[\"timestamp_local\"], utc=False)\n", + "print(ddf[[\"timestamp_local\", \"converted_date\"]].compute())\n", + "# timestamp_local converted_date\n", + "# 0 2021-01-01 09:01:12 2021-01-01 09:01:12\n", + "# 1 2021-01-01 09:01:12 2021-01-01 09:01:12\n", + "# 2 2021-01-01 09:01:12 2021-01-01 09:01:12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2234c76-cf8d-47ed-bf9e-05a3c299f120", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def convert_tz(datetime_object, local_timezone):\n", + " timezone_aware = datetime_object.tz_localize(local_timezone)\n", + " timezone_est = timezone_aware.tz_convert(\"America/New_York\")\n", + " return timezone_est\n", + "\n", + "\n", + "ddf[\"converted_date_tz_aware\"] = ddf[[\"converted_date\", \"location\"]].apply(\n", + " lambda row: convert_tz(row[\"converted_date\"], row[\"location\"]),\n", + " axis=1,\n", + " meta=(\"converted_date_tz_aware\", \"float\"),\n", + ")\n", + "\n", + "print(ddf[[\"location\", \"converted_date_tz_aware\"]].compute())\n", + "# location converted_date_tz_aware\n", + "# 0 Asia/Almaty 2020-12-31 22:01:12-05:00\n", + "# 1 Europe/London 2021-01-01 04:01:12-05:00\n", + "# 2 America/New_York 2021-01-01 09:01:12-05:00" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}